From ac70c57c1501da872b29092e06a6fa07cfe3c29e Mon Sep 17 00:00:00 2001
From: Harald Hoyer
Date: Wed, 20 May 2026 07:14:26 +0200
Subject: [PATCH] chore(halo): preload both llama models and tune preset

Preload Qwen3.6-27B and Qwen3.6-35B-A3B at startup (load-on-startup)
so both are warm immediately under --models-max 2, and set
parallel = 1 in [*] as the fallback for any other model.

For Qwen3.6-35B-A3B, reduce the context size (c: 262144 -> 131072)
and the draft depth (spec-draft-n-max: 3 -> 2).
---
 systems/x86_64-linux/halo/models.ini | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index 9d23606..83b160b 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -2,6 +2,7 @@ version = 1
 
 [*]
 flash-attn = on
+parallel = 1
 jinja = true
 n-gpu-layers = 99
 threads = 8
@@ -20,13 +21,15 @@ spec-type = draft-mtp
 
 [Qwen3.6-35B-A3B]
 hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
-spec-draft-n-max = 3
+spec-draft-n-max = 2
 parallel = 1
-c = 262144
+c = 131072
+load-on-startup = true
 
 [Qwen3.6-27B]
 hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-draft-n-max = 6
 parallel = 2
 c = 524288
+load-on-startup = true
 