chore(halo): tune llama models.ini and drop 35B-A3B model

Serve only Qwen3.6-27B; remove the unused 35B-A3B preset.

Tuning:
- Move model-specific keys (spec-type, sampling temp/top-p/top-k/min-p)
  out of the [*] defaults into [Qwen3.6-27B] so they no longer leak onto
  other models; draft-mtp in particular only works on MTP-weighted models.
- Drop the duplicate parallel key from [*].
- Bump ubatch-size 256 -> 512 for faster iGPU prefill on Strix Halo.
- Add threads-batch = 16 to use all cores for prefill while keeping
  generation at threads = 8 under full GPU offload.
This commit is contained in:
Harald Hoyer 2026-05-20 11:16:18 +02:00
parent 2e5fb2bf83
commit 5ee2f65337

View file

@@ -6,30 +6,24 @@ parallel = 1
jinja = true jinja = true
n-gpu-layers = 99 n-gpu-layers = 99
threads = 8 threads = 8
ubatch-size = 256 ubatch-size = 512
cache-type-k = bf16 cache-type-k = bf16
cache-type-v = bf16 cache-type-v = bf16
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
mmap = false mmap = false
no-context-shift = true no-context-shift = true
chat-template-kwargs = {"preserve_thinking": true} chat-template-kwargs = {"preserve_thinking": true}
fit = on fit = on
spec-type = draft-mtp
[Qwen3.6-35B-A3B]
hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
spec-draft-n-max = 2
parallel = 1
c = 131072 c = 131072
load-on-startup = true
[Qwen3.6-27B] [Qwen3.6-27B]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
spec-type = draft-mtp
spec-draft-n-max = 6 spec-draft-n-max = 6
threads-batch = 16
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
parallel = 2 parallel = 2
c = 524288 c = 524288
load-on-startup = true load-on-startup = true