From ac70c57c1501da872b29092e06a6fa07cfe3c29e Mon Sep 17 00:00:00 2001
From: Harald Hoyer <harald@hoyer.xyz>
Date: Wed, 20 May 2026 07:14:26 +0200
Subject: [PATCH] chore(halo): preload both llama models and tune preset

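Preload Qwen3.6-27B and Qwen3.6-35B-A3B at startup (load-on-startup)
so both are warm immediately under --models-max 2, and set
parallel = 1 in [*] as the fallback for any other model. For
Qwen3.6-35B-A3B, halve the context size (c: 262144 -> 131072) and
lower the draft depth (spec-draft-n-max: 3 -> 2); the Qwen3.6-27B
context size and draft depth are unchanged.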
---
 systems/x86_64-linux/halo/models.ini | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index 9d23606..83b160b 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -2,6 +2,7 @@ version = 1
 
 [*]
 flash-attn          = on
+parallel            = 1
 jinja               = true
 n-gpu-layers        = 99
 threads             = 8
@@ -20,13 +21,15 @@ spec-type           = draft-mtp
 
 [Qwen3.6-35B-A3B]
 hf                  = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
-spec-draft-n-max    = 3
+spec-draft-n-max    = 2
 parallel            = 1
-c                   = 262144
+c                   = 131072
+load-on-startup     = true
 
 [Qwen3.6-27B]
 hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-draft-n-max    = 6
 parallel            = 2
 c                   = 524288
+load-on-startup     = true