chore(halo): preload both llama models and tune preset

Preload Qwen3.6-27B and Qwen3.6-35B-A3B at startup (load-on-startup)
so both are warm immediately under --models-max 2, set parallel = 1
as the [*] fallback for any other model, and adjust per-model context
size and draft depth.
This commit is contained in:
Harald Hoyer 2026-05-20 07:14:26 +02:00
parent 31e491e314
commit ac70c57c15

View file

@@ -2,6 +2,7 @@ version = 1
[*] [*]
flash-attn = on flash-attn = on
parallel = 1
jinja = true jinja = true
n-gpu-layers = 99 n-gpu-layers = 99
threads = 8 threads = 8
@@ -20,13 +21,15 @@ spec-type = draft-mtp
[Qwen3.6-35B-A3B] [Qwen3.6-35B-A3B]
hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
spec-draft-n-max = 3 spec-draft-n-max = 2
parallel = 1 parallel = 1
c = 262144 c = 131072
load-on-startup = true
[Qwen3.6-27B] [Qwen3.6-27B]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
spec-draft-n-max = 6 spec-draft-n-max = 6
parallel = 2 parallel = 2
c = 524288 c = 524288
load-on-startup = true