nixcfg/systems/x86_64-linux/halo/models.ini
Harald Hoyer 0edf975c30 feat(halo): serve multiple llama models via models.ini preset
Replace the per-model llama-server units with a single service that
uses llama-server's --models-preset (models.ini) and --models-max 2,
so the 35B-A3B and 27B models are loaded on demand from one config.

Drop the now-redundant 27B / 27B-MTP / coder-next variant files and
the unused CacheDirectory + slot-save-path KV-slot handling.
2026-05-20 00:23:50 +02:00

30 lines
720 B
INI

; Models preset for llama-server (per the change description, passed via --models-preset).
; NOTE(review): presumably the preset format version read by the parser - confirm before changing.
version = 1
; Wildcard section.
; NOTE(review): presumably these keys are defaults applied to every model section
; in this file - confirm against the llama-server preset documentation.
; Key names appear to mirror llama-server command-line options of the same name.
[*]
; NOTE(review): "on" looks like an option value here (not a true/false flag) - confirm before normalising.
flash-attn = on
; NOTE(review): presumably the number of parallel request slots per loaded model - confirm.
parallel = 2
jinja = true
; NOTE(review): presumably "offload up to 99 layers", i.e. effectively all of them - confirm.
n-gpu-layers = 99
threads = 8
ubatch-size = 256
; The K and V cache types are set separately; both use bf16 here.
cache-type-k = bf16
cache-type-v = bf16
; Default sampling parameters (temperature, top-p, top-k, min-p).
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
mmap = false
; The key carries a "no-" prefix, so "true" here switches the context-shift feature OFF.
no-context-shift = true
; The value is a JSON object; keep it on a single line.
; NOTE(review): presumably forwarded to the chat template as keyword arguments - confirm.
chat-template-kwargs = {"preserve_thinking": true}
; 524288 = 512 * 1024.
; NOTE(review): "c" is presumably the short form of the context-size option (tokens) - confirm.
c = 524288
; NOTE(review): "on" looks like an option value here as well - confirm.
fit = on
; Both model sections in this file reference "-MTP-" GGUF repos and set spec-draft-n-max.
; NOTE(review): presumably selects multi-token-prediction speculative decoding - confirm.
spec-type = draft-mtp
; The 35B-A3B model entry (one of the two models the change description says are loaded on demand).
; NOTE(review): the section name is presumably the model name clients request - confirm.
[Qwen3.6-35B-A3B]
; Format is <repo>:<quantisation tag>.
; NOTE(review): presumably resolved and downloaded from Hugging Face - confirm.
hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
; Per-model value; the 27B entry in this file uses 6.
; NOTE(review): presumably the maximum number of draft tokens per speculative step - confirm.
spec-draft-n-max = 3
; The 27B model entry (one of the two models the change description says are loaded on demand).
; NOTE(review): the section name is presumably the model name clients request - confirm.
[Qwen3.6-27B]
; Format is <repo>:<quantisation tag>.
; NOTE(review): presumably resolved and downloaded from Hugging Face - confirm.
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
; Per-model value; the 35B-A3B entry in this file uses 3.
; NOTE(review): presumably the maximum number of draft tokens per speculative step - confirm.
spec-draft-n-max = 6