feat(halo): serve multiple llama models via models.ini preset
Replace the per-model llama-server units with a single service that uses llama-server's --models-preset (models.ini) and --models-max 2, so the 35B-A3B and 27B models are loaded on demand from one config. Drop the now-redundant 27B / 27B-MTP / coder-next variant files and the unused CacheDirectory + slot-save-path KV-slot handling.
This commit is contained in:
parent
ae068cfd84
commit
0edf975c30
6 changed files with 34 additions and 199 deletions
30
systems/x86_64-linux/halo/models.ini
Normal file
30
systems/x86_64-linux/halo/models.ini
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
; NOTE(review): presumably the preset-file schema version read by llama-server's --models-preset loader; confirm.
version = 1
|
||||
|
||||
; Wildcard section. NOTE(review): presumably these keys are shared defaults for
; every model section in this file; confirm against the llama-server preset docs.
[*]
|
||||
flash-attn = on
|
||||
parallel = 2
|
||||
jinja = true
|
||||
n-gpu-layers = 99
|
||||
threads = 8
|
||||
ubatch-size = 256
|
||||
; The K and V cache types are set to the same value (bf16).
cache-type-k = bf16
|
||||
cache-type-v = bf16
|
||||
; NOTE(review): temp / top-p / top-k / min-p look like default sampling parameters; confirm.
temp = 0.6
|
||||
top-p = 0.95
|
||||
top-k = 20
|
||||
min-p = 0.0
|
||||
mmap = false
|
||||
no-context-shift = true
|
||||
; The value is a JSON object literal.
chat-template-kwargs = {"preserve_thinking": true}
|
||||
; 524288 = 512 * 1024. NOTE(review): assumed to be the context size in tokens
; (`c` as the short form of --ctx-size); confirm.
c = 524288
|
||||
fit = on
|
||||
; The per-model sections set spec-draft-n-max alongside this spec-type.
spec-type = draft-mtp
|
||||
|
||||
; Preset for the 35B-A3B model, one of the two models the commit message says are loaded on demand.
[Qwen3.6-35B-A3B]
|
||||
; NOTE(review): presumably a Hugging Face "<repo>:<quant>" reference; confirm.
hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
|
||||
; Per-model value: 3 here, 6 in [Qwen3.6-27B].
spec-draft-n-max = 3
|
||||
|
||||
; Preset for the 27B model, one of the two models the commit message says are loaded on demand.
[Qwen3.6-27B]
|
||||
; NOTE(review): presumably a Hugging Face "<repo>:<quant>" reference; confirm.
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
|
||||
; Per-model value: 6 here, 3 in [Qwen3.6-35B-A3B].
spec-draft-n-max = 6
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue