From 5ee2f653372ece18e51e13b93a560f827f540a3c Mon Sep 17 00:00:00 2001
From: Harald Hoyer
Date: Wed, 20 May 2026 11:16:18 +0200
Subject: [PATCH] chore(halo): tune llama models.ini and drop 35B-A3B model

Serve only Qwen3.6-27B; remove the unused 35B-A3B preset.

Tuning:
- Move model-specific keys (spec-type, sampling temp/top-p/top-k/min-p)
  out of the [*] defaults into [Qwen3.6-27B] so they no longer leak onto
  other models; draft-mtp in particular only works on MTP-weighted
  models.
- Drop the duplicate parallel key from [*].
- Bump ubatch-size 256 -> 512 for faster iGPU prefill on Strix Halo.
- Add threads-batch = 16 to use all cores for prefill while keeping
  generation at threads = 8 under full GPU offload.
---
 systems/x86_64-linux/halo/models.ini | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index 83b160b..ddb3509 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -6,30 +6,24 @@ parallel = 1
 jinja = true
 n-gpu-layers = 99
 threads = 8
-ubatch-size = 256
+ubatch-size = 512
 cache-type-k = bf16
 cache-type-v = bf16
-temp = 0.6
-top-p = 0.95
-top-k = 20
-min-p = 0.0
 mmap = false
 no-context-shift = true
 chat-template-kwargs = {"preserve_thinking": true}
 fit = on
-spec-type = draft-mtp
-
-[Qwen3.6-35B-A3B]
-hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
-spec-draft-n-max = 2
-parallel = 1
 c = 131072
-load-on-startup = true
 
 [Qwen3.6-27B]
 hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
+spec-type = draft-mtp
 spec-draft-n-max = 6
+threads-batch = 16
+temp = 0.6
+top-p = 0.95
+top-k = 20
+min-p = 0.0
 parallel = 2
 c = 524288
 load-on-startup = true
-