nixcfg/systems/x86_64-linux/halo/models.ini
Harald Hoyer 5ee2f65337 chore(halo): tune llama models.ini and drop 35B-A3B model
Serve only Qwen3.6-27B; remove the unused 35B-A3B preset.

Tuning:
- Move model-specific keys (spec-type, sampling temp/top-p/top-k/min-p)
  out of the [*] defaults into [Qwen3.6-27B] so they no longer leak onto
  other models; draft-mtp in particular only works on MTP-weighted models.
- Drop the duplicate parallel key from [*].
- Bump ubatch-size 256 -> 512 for faster iGPU prefill on Strix Halo.
- Add threads-batch = 16 to use all cores for prefill while keeping
  generation at threads = 8 under full GPU offload.
2026-05-20 14:23:42 +02:00

29 lines
718 B
INI

version = 1

; Global defaults applied to every model preset. A model section may override
; any key set here, so keep model-specific tuning out of this section.
[*]
; GPU: offload all layers, with flash attention enabled
n-gpu-layers = 99
flash-attn = on
fit = on

; CPU threads used for generation
threads = 8

; Physical batch size; 512 (up from 256) gives faster iGPU prefill on Strix Halo
ubatch-size = 512

; Default context size and slot count (a model section may override both)
c = 131072
parallel = 1
no-context-shift = true

; KV cache data types
cache-type-k = bf16
cache-type-v = bf16

; Model loading
mmap = false

; Chat templating
jinja = true
chat-template-kwargs = {"preserve_thinking": true}
; Preset for Qwen3.6-27B (MTP-weighted GGUF build), loaded at server start.
[Qwen3.6-27B]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
load-on-startup = true

; Overrides of the [*] defaults for slot count and context size
; NOTE(review): presumably the context is shared across the 2 slots - confirm
parallel = 2
c = 524288

; Use all cores for prefill; generation keeps the threads value from [*]
threads-batch = 16

; Speculative decoding. draft-mtp only works on MTP-weighted models, so it is
; configured here rather than in [*].
spec-type = draft-mtp
spec-draft-n-max = 6

; Sampling settings specific to this model
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0