Last active
May 20, 2026 11:10
-
-
Save feedsbrain/805661a2d28b238a38ed4118f6211ef6 to your computer and use it in GitHub Desktop.
ComfyUI with AMD Radeon RX6800XT GPU with 16 GB VRAM on Ubuntu 26 + ROCm 7.1.1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Launch ComfyUI on an AMD Radeon RX 6800 XT (gfx1030, 16 GB VRAM) under ROCm 7.1.
#
# Usage: ./webui.sh [any other ComfyUI args]
#
# Expects venv/ and main.py in the current directory. If venv/ is not found
# there, the directory containing this script is tried instead.
# Exit status: that of the ComfyUI process (non-zero on setup failure).

# Print a message to stderr and abort.
die() {
  printf 'webui.sh: %s\n' "$*" >&2
  exit 1
}

# Resolve the working directory. The caller's CWD wins when it already holds
# the venv; otherwise fall back to where this script lives so the launcher
# also works when invoked from elsewhere.
if [[ ! -f venv/bin/activate ]]; then
  script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) \
    || die "cannot determine script directory"
  cd -- "$script_dir" || die "cannot cd to $script_dir"
fi
[[ -f venv/bin/activate ]] || die "venv/bin/activate not found in $PWD"

# Abort if activation fails: continuing would silently run main.py with
# whatever `python` happens to be first on PATH.
# shellcheck disable=SC1091
source venv/bin/activate || die "failed to activate venv in $PWD"

# RX 6800 XT = gfx1030; ROCm 7.1 supports it natively — override kept for edge-case safety
export HSA_OVERRIDE_GFX_VERSION="10.3.0"

# SDMA causes hangs/illegal address on RDNA 2 under load
export HSA_ENABLE_SDMA=0

# AOTriton flash attention — reported stable on gfx1030 with ROCm 7.1 by the author.
# NOTE(review): the variable name marks this path as experimental on the PyTorch side;
# re-validate after PyTorch/ROCm upgrades.
export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1

# hipblaslt's pre-built Tensile library (TensileLibrary_lazy_gfx1030.dat) is missing from
# the PyTorch ROCm wheel for gfx1030 — fall back to hipblas which supports it correctly
export TORCH_BLAS_PREFER_HIPBLASLT=0

# MIOpen FAST mode (2): heuristic-only kernel selection — avoids minutes of exhaustive search at startup
export MIOPEN_FIND_MODE=2

# expandable_segments reduces VRAM fragmentation but caused hipErrorIllegalAddress with GFX overrides,
# so it is deliberately NOT set here.
# garbage_collection_threshold:0.3 — trigger GC at 30% occupancy (was 0.5); lower threshold gives
#   headroom for ControlNet/MODEL_PATCH spikes on top of the base model (~11.5 GB on disk, BF16 ≈ 5.7B params).
# max_split_size_mb:2048 — ControlNet activation tensors at 768×1024 (latent 96×128 = 12k tokens)
#   can exceed 1024MB; 2048 avoids falling back to fresh allocations that fragment the pool.
export PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.3,max_split_size_mb:2048

# glibc malloc tuning, intended to let the OS reclaim large freed allocations promptly.
# Without this, PyTorch's CPU tensors (offloaded model weights under --lowvram) stay in the brk heap
# and RSS grows across repeated runs until the process crashes from RAM exhaustion.
# NOTE(review): mallopt(3) documents an upper limit for M_MMAP_THRESHOLD (32 MiB on 64-bit),
# so the 128 MiB value below may be rejected by glibc and only the trim threshold may take
# effect — confirm against the glibc version in use before changing either value.
export MALLOC_MMAP_THRESHOLD_=134217728
export MALLOC_TRIM_THRESHOLD_=134217728

# WAN video generation: use VAEDecodeTiled (not VAEDecode) to avoid VRAM OOM on long sequences.
# Only tested/working: WAN 2.2 5B model, 960×544 resolution, tile_size=256, overlap=64,
# temporal_size=16, temporal_overlap=4.

# proto-plus/protobuf C extension segfaults when google-generativeai is loaded by comfyui_starnodes;
# pure Python implementation avoids the crash at the cost of slightly slower protobuf serialization.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# --reserve-vram 1: keep 1 GB VRAM free so ROCm never fully exhausts the allocator,
# which would dump everything to RAM simultaneously and spike usage.
# NOTE(review): --use-split-cross-attention selects ComfyUI's split attention path; confirm
# whether the AOTriton setting above still has any effect while this flag is passed.
status=0
python main.py --enable-manager --use-split-cross-attention --lowvram --reserve-vram 1 "$@" \
  || status=$?

# Leave the venv, then report ComfyUI's exit status rather than deactivate's.
deactivate
exit "$status"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment