diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index 83b160b..ddb3509 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -6,30 +6,24 @@ parallel = 1
 jinja = true
 n-gpu-layers = 99
 threads = 8
-ubatch-size = 256
+ubatch-size = 512
 cache-type-k = bf16
 cache-type-v = bf16
-temp = 0.6
-top-p = 0.95
-top-k = 20
-min-p = 0.0
 mmap = false
 no-context-shift = true
 chat-template-kwargs = {"preserve_thinking": true}
 fit = on
-spec-type = draft-mtp
-
-[Qwen3.6-35B-A3B]
-hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q8_K_XL
-spec-draft-n-max = 2
-parallel = 1
 c = 131072
-load-on-startup = true
 
 [Qwen3.6-27B]
 hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
+spec-type = draft-mtp
 spec-draft-n-max = 6
+threads-batch = 16
+temp = 0.6
+top-p = 0.95
+top-k = 20
+min-p = 0.0
 parallel = 2
 c = 524288
 load-on-startup = true
-