Last active
May 16, 2026 22:22
-
-
Save coolaj86/43884323c299ad8bab13a75072732703 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Set up a build toolchain, build llama.cpp from source at tag b9186, and
# start llama-server with a Qwen3.6 35B-A3B GGUF model pulled via -hf.
#
# Uses xcode-select and sysctl hw.logicalcpu, so this targets macOS.
# Reads:   ~/.config/agents/tokens.txt (passed to --api-key-file)
# Listens: 0.0.0.0:11234
#
# Safe to re-run: an existing ./llama.cpp checkout is reused.
set -eu

# Install xcode and cmake
# xcode-select exits non-zero when the tools are already installed, so its
# status is deliberately ignored instead of aborting under `set -e`.
xcode-select --install || true
# -f: fail on HTTP errors rather than piping an error page into sh.
curl -fsS https://webi.sh/cmake | sh
# Load PATH additions (presumably written by the webi installer) so that the
# cmake calls below resolve.
# shellcheck source=/dev/null
. ~/.config/envman/PATH.env

# Download llama.cpp at 1348f67
# Skip the clone when a checkout already exists so the script can be re-run.
if [ ! -d ./llama.cpp/.git ]; then
  git clone https://github.com/ggml-org/llama.cpp
fi
cd ./llama.cpp/ || exit 1

# Note: MTP added on May 16th in b9186
git switch --detach b9186

# Build llama.cpp
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_CUDA=OFF
# Quote the substitution so the job count is always passed as a single word.
cmake --build build --config Release -j "$(sysctl -n hw.logicalcpu)"

# Run and download model
./build/bin/llama-server \
    --verbose \
    --metrics \
    --api-key-file ~/.config/agents/tokens.txt \
    --n-gpu-layers 99 \
    --ctx-size 132000 \
    --no-context-shift \
    --host 0.0.0.0 --port 11234 \
    -hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_M
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
set -eu

# Start llama-server with the Qwen3.6 27B MTP GGUF model (UD-Q4_K_XL quant).
# Run from the llama.cpp checkout root: the binary path below is relative.
#
# Background reading:
#   https://www.reddit.com/r/LocalLLaMA/comments/1tdns1i/used_over_a_million_tokens_in_three_separate/
#   https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
#   https://unsloth.ai/docs/models/qwen3.6#mtp-guide
#
# Sampling presets:
#   Thinking mode for general tasks:
#     temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
#   Thinking mode for precise coding tasks (e.g. WebDev):
#     temperature=0.6, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
#   Instruct (or non-thinking) mode:
#     temperature=0.7, top_p=0.80, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0

#export LLAMA_CACHE="$HOME/.cache/huggingface/hub"

# The argument list is assembled in groups via the positional parameters
# (POSIX sh has no arrays); the script takes no arguments of its own.

# Logging, network endpoint, and API-key authentication.
set -- \
    --verbose \
    --metrics \
    --host 0.0.0.0 --port 11234 \
    --api-key-file ~/.config/agents/tokens.txt

# GPU offload and context window.
set -- "$@" \
    --n-gpu-layers 99 \
    --ctx-size 262144 \
    --no-context-shift

# Model to fetch and load.
# Alternative quant: -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_M
set -- "$@" \
    -hf unsloth/Qwen3.6-27B-MTP-GGUF:UD-Q4_K_XL

# Sampling: these values match the "precise coding tasks" preset above.
set -- "$@" \
    --temp 0.6 \
    --top-p 0.95 \
    --top-k 20 \
    --min-p 0.0 \
    --presence-penalty 0.0 \
    --repeat-penalty 1.0

# In my testing, MTP was a minor slowdown: (baseline) ~14.5 t/s to (1) 13.5 t/s, (2) 12.5 t/s, (6) 9.5 t/s
# (the parenthesized numbers are presumably --spec-draft-n-max values)
# To try it, append:
#   set -- "$@" --spec-type draft-mtp --spec-draft-n-max 2

./build/bin/llama-server "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment