Skip to content

Instantly share code, notes, and snippets.

@feedsbrain
Last active May 20, 2026 11:10
Show Gist options
  • Select an option

  • Save feedsbrain/805661a2d28b238a38ed4118f6211ef6 to your computer and use it in GitHub Desktop.

Select an option

Save feedsbrain/805661a2d28b238a38ed4118f6211ef6 to your computer and use it in GitHub Desktop.
ComfyUI with AMD Radeon RX6800XT GPU with 16 GB VRAM on Ubuntu 26 + ROCm 7.1.1
#!/bin/bash
#
# Launch ComfyUI on an AMD Radeon RX 6800 XT (gfx1030, 16 GB VRAM) under ROCm 7.1.
#
# Usage: ./webui.sh [any other ComfyUI args]
#
# Run this from the ComfyUI checkout: venv/ and main.py are both resolved
# relative to the current directory. The script exits with ComfyUI's own exit
# status so wrappers (systemd, restart loops, `&&` chains) can detect a crash.

# Refuse to continue without the virtualenv; otherwise main.py would be started
# with whatever `python` happens to be on PATH and fail in confusing ways.
# shellcheck source=/dev/null
if ! source venv/bin/activate; then
  printf '%s\n' "error: could not activate venv/bin/activate (run this script from the ComfyUI directory)" >&2
  exit 1
fi

# --- ROCm / HIP runtime -------------------------------------------------------

# RX 6800 XT = gfx1030; ROCm 7.1 supports it natively — override kept for edge-case safety
export HSA_OVERRIDE_GFX_VERSION="10.3.0"

# SDMA causes hangs/illegal address on RDNA 2 under load
export HSA_ENABLE_SDMA=0

# AOTriton flash attention — reported by the author as stable on gfx1030 in ROCm 7.1,
# although PyTorch still gates it behind this "experimental" opt-in flag.
export TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1

# hipblaslt's pre-built Tensile library (TensileLibrary_lazy_gfx1030.dat) is missing from
# the PyTorch ROCm wheel for gfx1030 — fall back to hipblas which supports it correctly
export TORCH_BLAS_PREFER_HIPBLASLT=0

# MIOpen FAST mode (2): heuristic-only kernel selection — avoids minutes of exhaustive search at startup
export MIOPEN_FIND_MODE=2

# --- GPU memory allocator -----------------------------------------------------

# expandable_segments is deliberately NOT set: it reduces VRAM fragmentation but caused
# hipErrorIllegalAddress with GFX overrides.
# garbage_collection_threshold:0.3 — trigger GC at 30% occupancy (was 0.5); lower threshold gives
# headroom for ControlNet/MODEL_PATCH spikes on top of the base model (~11.5 GB on disk, BF16 ≈ 5.7B params).
# max_split_size_mb:2048 — ControlNet activation tensors at 768×1024 (latent 96×128 = 12k tokens)
# can exceed 1024MB; 2048 avoids falling back to fresh allocations that fragment the pool.
export PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.3,max_split_size_mb:2048

# --- Host (CPU) memory allocator ----------------------------------------------

# glibc malloc: use mmap() for allocations >128MB so the OS can reclaim them immediately when freed.
# Without this, PyTorch's CPU tensors (offloaded model weights under --lowvram) stay in the brk heap
# and RSS grows across repeated runs until the process crashes from RAM exhaustion.
# NOTE(review): mallopt(3) documents an upper limit for M_MMAP_THRESHOLD of
# 4*1024*1024*sizeof(long) (32 MiB on 64-bit); 128 MiB may be silently ignored by glibc.
# Value left unchanged here — TODO confirm the effective threshold before tuning it.
export MALLOC_MMAP_THRESHOLD_=134217728
export MALLOC_TRIM_THRESHOLD_=134217728

# --- Python-level workarounds -------------------------------------------------

# proto-plus/protobuf C extension segfaults when google-generativeai is loaded by comfyui_starnodes;
# pure Python implementation avoids the crash at the cost of slightly slower protobuf serialization.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# --- Workflow note (not enforced by this script) ------------------------------

# WAN video generation: use VAEDecodeTiled (not VAEDecode) to avoid VRAM OOM on long sequences.
# Only tested/working: WAN 2.2 5B model, 960×544 resolution, tile_size=256, overlap=64, temporal_size=16, temporal_overlap=4.

# --- Launch -------------------------------------------------------------------

# --reserve-vram 1: keep 1 GB VRAM free so ROCm never fully exhausts the allocator,
# which would dump everything to RAM simultaneously and spike usage.
# "$@" forwards any extra command-line arguments to ComfyUI unchanged.
python main.py --enable-manager --use-split-cross-attention --lowvram --reserve-vram 1 "$@"
comfy_status=$?

# Leave the virtualenv, then report ComfyUI's status rather than deactivate's.
deactivate
exit "$comfy_status"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment