coolaj86 · May 16, 2026 22:22
diff --git a/llama-cpp-install.sh b/llama-cpp-install.sh
 #!/bin/sh
 #
 # Install the build prerequisites, fetch and build llama.cpp at tag b9186,
 # then start llama-server (which downloads the model named by -hf).
 #
 # `set -e` is intentionally not enabled: xcode-select may exit non-zero when
 # the command line tools are already installed, and that must not stop the
 # script. Steps whose failure would invalidate everything after them abort
 # explicitly instead.

 # Install xcode and cmake
 xcode-select --install
 curl https://webi.sh/cmake | sh
 . ~/.config/envman/PATH.env

 # Download llama.cpp at 1348f67
 # Skip the clone when a checkout already exists so the script can be re-run.
 if [ ! -d ./llama.cpp/.git ]; then
   git clone https://github.com/ggml-org/llama.cpp || exit 1
 fi
 # Never build from the wrong directory if the checkout is missing.
 cd ./llama.cpp/ || exit 1
 # Note: MTP added on May 16th in b9186
 git switch b9186 --detach || exit 1

 # Build llama.cpp
 cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_CUDA=OFF || exit 1
 # Quote the substitution so the job count is always passed as a single word.
 cmake --build build --config Release -j "$(sysctl -n hw.logicalcpu)" || exit 1

 # Run and download model
 ./build/bin/llama-server \
   --verbose \
   --metrics \
   --api-key-file ~/.config/agents/tokens.txt \
   --n-gpu-layers 99 \
   --ctx-size 132000 \
   --no-context-shift \
   --host 0.0.0.0 --port 11234 \
   -hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_M
diff --git a/run-qwen-mtp.sh b/run-qwen-mtp.sh
 #!/bin/sh
 set -eu

 # Start llama-server with unsloth/Qwen3.6-27B-MTP-GGUF.
 #
 # References:
 #   https://www.reddit.com/r/LocalLLaMA/comments/1tdns1i/used_over_a_million_tokens_in_three_separate/
 #   https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
 #   https://unsloth.ai/docs/models/qwen3.6#mtp-guide
 #
 # Sampling presets:
 #   Thinking mode for general tasks:
 #     temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
 #   Thinking mode for precise coding tasks (e.g. WebDev):
 #     temperature=0.6, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
 #   Instruct (or non-thinking) mode:
 #     temperature=0.7, top_p=0.80, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
 # The sampling flags below carry the "precise coding tasks" values.

 #export LLAMA_CACHE="$HOME/.cache/huggingface/hub"

 # Collect the server arguments in the positional parameters, group by group.
 # The first `set --` discards any arguments given to this script.
 set -- --verbose --metrics
 set -- "$@" --host 0.0.0.0 --port 11234
 set -- "$@" --api-key-file ~/.config/agents/tokens.txt
 set -- "$@" --n-gpu-layers 99 --ctx-size 262144 --no-context-shift
 set -- "$@" -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_XL
 set -- "$@" --temp 0.6 --top-p 0.95 --top-k 20
 set -- "$@" --min-p 0.0 --presence-penalty 0.0 --repeat-penalty 1.0

 ./build/bin/llama-server "$@"

 # In my testing, MTP was a minor slowdown: (baseline) ~14.5 t/s to (1) 13.5 t/s, (2) 12.5 t/s, (6) 9.5 t/s
 # Flags that were tried for it, kept for reference:
 #   --spec-type draft-mtp --spec-draft-n-max 2 \
 #-hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_M \
	#!/bin/sh
	#
	# Install the build prerequisites, fetch and build llama.cpp at tag b9186,
	# then start llama-server (which downloads the model named by -hf).
	#
	# `set -e` is intentionally not enabled: xcode-select may exit non-zero when
	# the command line tools are already installed, and that must not stop the
	# script. Steps whose failure would invalidate everything after them abort
	# explicitly instead.

	# Install xcode and cmake
	xcode-select --install
	# The installer script must be piped into sh, so the pipe stays unescaped.
	curl https://webi.sh/cmake | sh
	. ~/.config/envman/PATH.env

	# Download llama.cpp at 1348f67
	# Skip the clone when a checkout already exists so the script can be re-run.
	if [ ! -d ./llama.cpp/.git ]; then
	  git clone https://github.com/ggml-org/llama.cpp || exit 1
	fi
	# Never build from the wrong directory if the checkout is missing.
	cd ./llama.cpp/ || exit 1
	# Note: MTP added on May 16th in b9186
	git switch b9186 --detach || exit 1

	# Build llama.cpp
	cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_CUDA=OFF || exit 1
	# Quote the substitution so the job count is always passed as a single word.
	cmake --build build --config Release -j "$(sysctl -n hw.logicalcpu)" || exit 1

	# Run and download model
	./build/bin/llama-server \
	  --verbose \
	  --metrics \
	  --api-key-file ~/.config/agents/tokens.txt \
	  --n-gpu-layers 99 \
	  --ctx-size 132000 \
	  --no-context-shift \
	  --host 0.0.0.0 --port 11234 \
	  -hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_M
	#!/bin/sh
	set -eu

	# Start llama-server with unsloth/Qwen3.6-27B-MTP-GGUF.
	#
	# References:
	#   https://www.reddit.com/r/LocalLLaMA/comments/1tdns1i/used_over_a_million_tokens_in_three_separate/
	#   https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
	#   https://unsloth.ai/docs/models/qwen3.6#mtp-guide
	#
	# Sampling presets:
	#   Thinking mode for general tasks:
	#     temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
	#   Thinking mode for precise coding tasks (e.g. WebDev):
	#     temperature=0.6, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
	#   Instruct (or non-thinking) mode:
	#     temperature=0.7, top_p=0.80, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
	# The sampling flags below carry the "precise coding tasks" values.

	#export LLAMA_CACHE="$HOME/.cache/huggingface/hub"

	# Collect the server arguments in the positional parameters, one option per
	# step. The first `set --` discards any arguments given to this script.
	set -- --verbose
	set -- "$@" --metrics
	set -- "$@" --host 0.0.0.0 --port 11234
	set -- "$@" --api-key-file ~/.config/agents/tokens.txt
	set -- "$@" --n-gpu-layers 99
	set -- "$@" --ctx-size 262144
	set -- "$@" --no-context-shift
	set -- "$@" -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_XL
	set -- "$@" --temp 0.6
	set -- "$@" --top-p 0.95
	set -- "$@" --top-k 20
	set -- "$@" --min-p 0.0
	set -- "$@" --presence-penalty 0.0
	set -- "$@" --repeat-penalty 1.0

	./build/bin/llama-server "$@"

	# In my testing, MTP was a minor slowdown: (baseline) ~14.5 t/s to (1) 13.5 t/s, (2) 12.5 t/s, (6) 9.5 t/s
	# Flags that were tried for it, kept for reference:
	# --spec-type draft-mtp --spec-draft-n-max 2 \
	#-hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_M \