feat(halo): serve multiple llama models via models.ini preset

Replace the per-model llama-server units with a single service that uses llama-server's --models-preset (models.ini) and --models-max 2, so the 35B-A3B and 27B models are loaded on demand from one config. Drop the now-redundant 27B / 27B-MTP / coder-next variant files and the unused CacheDirectory + slot-save-path KV-slot handling.
2026-05-20 00:19:27 +02:00 · 2026-05-20 00:19:27 +02:00 · 0edf975c30
commit 0edf975c30
parent ae068cfd84
6 changed files with 34 additions and 199 deletions
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -0,0 +1,30 @@
+version = 1
+
+[*]
+flash-attn          = on
+parallel            = 2
+jinja               = true
+n-gpu-layers        = 99
+threads             = 8
+ubatch-size         = 256
+cache-type-k        = bf16
+cache-type-v        = bf16
+temp                = 0.6
+top-p               = 0.95
+top-k               = 20
+min-p               = 0.0
+mmap                = false
+no-context-shift    = true
+chat-template-kwargs = {"preserve_thinking": true}
+c                   = 524288
+fit                 = on
+spec-type           = draft-mtp
+
+[Qwen3.6-35B-A3B]
+hf                  = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
+spec-draft-n-max    = 3
+
+[Qwen3.6-27B]
+hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
+spec-draft-n-max    = 6
+