chore(halo): preload both llama models and tune preset

Preload Qwen3.6-27B and Qwen3.6-35B-A3B at startup (load-on-startup =
true) so that both are warm immediately under --models-max 2. Also set
parallel = 1 in the [*] section as the fallback for any other model,
and adjust the per-model context size (c) and draft depth
(spec-draft-n-max).
This commit is contained in:
Harald Hoyer 2026-05-20 07:14:26 +02:00
parent 31e491e314
commit ac70c57c15

View file

@@ -2,6 +2,7 @@ version = 1
 [*]
 flash-attn = on
+parallel = 1
 jinja = true
 n-gpu-layers = 99
 threads = 8
@@ -20,13 +21,15 @@ spec-type = draft-mtp
 [Qwen3.6-35B-A3B]
 hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
-spec-draft-n-max = 3
+spec-draft-n-max = 2
 parallel = 1
-c = 262144
+c = 131072
+load-on-startup = true
 [Qwen3.6-27B]
 hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-draft-n-max = 6
 parallel = 2
 c = 524288
+load-on-startup = true