feedsbrain · May 20, 2026 11:10
diff --git a/webui.sh b/webui.sh
 #!/bin/bash
 # Launcher for ComfyUI on an RX 6800 XT (gfx1030) under ROCm: activates the local virtualenv,
 # exports the ROCm / PyTorch / glibc tuning variables below, then runs main.py.
 # Usage: ./webui.sh [any other ComfyUI args]
 # Exit status: that of main.py, or 1 if the virtualenv cannot be activated.

 # venv/ and main.py are resolved relative to the working directory. If the venv is not visible
 # from here (script launched from another directory), fall back to the directory that contains
 # this script. `return` covers the case where the file is sourced, `exit` the executed case.
 if [[ ! -f venv/bin/activate ]]; then
   cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || { return 1 2>/dev/null || exit 1; }
 fi

 # Stop here when activation fails; otherwise main.py would run under whatever `python` is on PATH.
 if ! source venv/bin/activate; then
   printf 'webui.sh: cannot activate venv/bin/activate (cwd: %s)\n' "$PWD" >&2
   return 1 2>/dev/null || exit 1
 fi

 # RX 6800 XT = gfx1030; ROCm 7.1 supports it natively — override kept for edge-case safety
 export HSA_OVERRIDE_GFX_VERSION="10.3.0"

 # SDMA causes hangs/illegal address on RDNA 2 under load
 export HSA_ENABLE_SDMA=0

 # AOTriton flash attention — production-stable on gfx1030 in ROCm 7.1
 # (the variable name marks this as an experimental opt-in; re-check after PyTorch/ROCm upgrades)
 export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1

 # hipblaslt's pre-built Tensile library (TensileLibrary_lazy_gfx1030.dat) is missing from
 # the PyTorch ROCm wheel for gfx1030 — fall back to hipblas which supports it correctly
 export TORCH_BLAS_PREFER_HIPBLASLT=0

 # MIOpen FAST mode (2): heuristic-only kernel selection — avoids minutes of exhaustive search at startup
 export MIOPEN_FIND_MODE=2

 # expandable_segments reduces VRAM fragmentation but caused hipErrorIllegalAddress with GFX overrides.
 # garbage_collection_threshold:0.3 — trigger GC at 30% occupancy (was 0.5); lower threshold gives
 # headroom for ControlNet/MODEL_PATCH spikes on top of the base model (~11.5 GB on disk, BF16 ≈ 5.7B params).
 # max_split_size_mb:2048 — ControlNet activation tensors at 768×1024 (latent 96×128 = 12k tokens)
 # can exceed 1024MB; 2048 avoids falling back to fresh allocations that fragment the pool.
 export PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.3,max_split_size_mb:2048

 # glibc malloc: use mmap() for allocations >128MB so the OS can reclaim them immediately when freed.
 # Without this, PyTorch's CPU tensors (offloaded model weights under --lowvram) stay in the brk heap
 # and RSS grows across repeated runs until the process crashes from RAM exhaustion.
 # NOTE(review): mallopt(3) documents an upper limit of 32 MiB (64-bit) for M_MMAP_THRESHOLD, so the
 # 128 MiB value below may be silently rejected by glibc — confirm the intended threshold.
 export MALLOC_MMAP_THRESHOLD_=134217728
 export MALLOC_TRIM_THRESHOLD_=134217728

 # WAN video generation: use VAEDecodeTiled (not VAEDecode) to avoid VRAM OOM on long sequences.
 #   Only tested/working: WAN 2.2 5B model, 960×544 resolution, tile_size=256, overlap=64, temporal_size=16, temporal_overlap=4.

 # proto-plus/protobuf C extension segfaults when google-generativeai is loaded by comfyui_starnodes;
 # pure Python implementation avoids the crash at the cost of slightly slower protobuf serialization.
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

 # --reserve-vram 1: keep 1 GB VRAM free so ROCm never fully exhausts the allocator,
 # which would dump everything to RAM simultaneously and spike usage.
 python main.py --enable-manager --use-split-cross-attention --lowvram --reserve-vram 1 "$@"
 comfy_status=$?

 # Leave the venv, then report main.py's status instead of letting the trailing deactivate mask it,
 # so callers (shell, service manager, wrapper scripts) can detect a crash.
 deactivate
 return "$comfy_status" 2>/dev/null || exit "$comfy_status"
	#!/bin/bash
	# Launcher for ComfyUI on an RX 6800 XT (gfx1030) under ROCm: activates the local virtualenv,
	# exports the ROCm / PyTorch / glibc tuning variables below, then runs main.py.
	# Usage: ./webui.sh [any other ComfyUI args]
	# Exit status: that of main.py, or 1 if the virtualenv cannot be activated.

	# venv/ and main.py are resolved relative to the working directory. If the venv is not visible
	# from here (script launched from another directory), fall back to the directory that contains
	# this script. `return` covers the case where the file is sourced, `exit` the executed case.
	if [[ ! -f venv/bin/activate ]]; then
	  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || { return 1 2>/dev/null || exit 1; }
	fi

	# Stop here when activation fails; otherwise main.py would run under whatever `python` is on PATH.
	if ! source venv/bin/activate; then
	  printf 'webui.sh: cannot activate venv/bin/activate (cwd: %s)\n' "$PWD" >&2
	  return 1 2>/dev/null || exit 1
	fi

	# RX 6800 XT = gfx1030; ROCm 7.1 supports it natively — override kept for edge-case safety
	export HSA_OVERRIDE_GFX_VERSION="10.3.0"

	# SDMA causes hangs/illegal address on RDNA 2 under load
	export HSA_ENABLE_SDMA=0

	# AOTriton flash attention — production-stable on gfx1030 in ROCm 7.1
	# (the variable name marks this as an experimental opt-in; re-check after PyTorch/ROCm upgrades)
	export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1

	# hipblaslt's pre-built Tensile library (TensileLibrary_lazy_gfx1030.dat) is missing from
	# the PyTorch ROCm wheel for gfx1030 — fall back to hipblas which supports it correctly
	export TORCH_BLAS_PREFER_HIPBLASLT=0

	# MIOpen FAST mode (2): heuristic-only kernel selection — avoids minutes of exhaustive search at startup
	export MIOPEN_FIND_MODE=2

	# expandable_segments reduces VRAM fragmentation but caused hipErrorIllegalAddress with GFX overrides.
	# garbage_collection_threshold:0.3 — trigger GC at 30% occupancy (was 0.5); lower threshold gives
	# headroom for ControlNet/MODEL_PATCH spikes on top of the base model (~11.5 GB on disk, BF16 ≈ 5.7B params).
	# max_split_size_mb:2048 — ControlNet activation tensors at 768×1024 (latent 96×128 = 12k tokens)
	# can exceed 1024MB; 2048 avoids falling back to fresh allocations that fragment the pool.
	export PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.3,max_split_size_mb:2048

	# glibc malloc: use mmap() for allocations >128MB so the OS can reclaim them immediately when freed.
	# Without this, PyTorch's CPU tensors (offloaded model weights under --lowvram) stay in the brk heap
	# and RSS grows across repeated runs until the process crashes from RAM exhaustion.
	# NOTE(review): mallopt(3) documents an upper limit of 32 MiB (64-bit) for M_MMAP_THRESHOLD, so the
	# 128 MiB value below may be silently rejected by glibc — confirm the intended threshold.
	export MALLOC_MMAP_THRESHOLD_=134217728
	export MALLOC_TRIM_THRESHOLD_=134217728

	# WAN video generation: use VAEDecodeTiled (not VAEDecode) to avoid VRAM OOM on long sequences.
	# Only tested/working: WAN 2.2 5B model, 960×544 resolution, tile_size=256, overlap=64, temporal_size=16, temporal_overlap=4.

	# proto-plus/protobuf C extension segfaults when google-generativeai is loaded by comfyui_starnodes;
	# pure Python implementation avoids the crash at the cost of slightly slower protobuf serialization.
	export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

	# --reserve-vram 1: keep 1 GB VRAM free so ROCm never fully exhausts the allocator,
	# which would dump everything to RAM simultaneously and spike usage.
	python main.py --enable-manager --use-split-cross-attention --lowvram --reserve-vram 1 "$@"
	comfy_status=$?

	# Leave the venv, then report main.py's status instead of letting the trailing deactivate mask it,
	# so callers (shell, service manager, wrapper scripts) can detect a crash.
	deactivate
	return "$comfy_status" 2>/dev/null || exit "$comfy_status"
No results found