# nixcfg/systems/x86_64-linux/halo/models.ini
#
# Model presets: one section per model, with `[*]` holding the defaults that
# individual sections override.
# NOTE(review): presumably consumed by llama.cpp's llama-server via
# --models-preset -- confirm against the service definition.

version = 1

[*]
# Defaults shared by every model section below; a section that sets the same
# key replaces the value given here.

# Context window and batching
c                   = 131072
parallel            = 1
ubatch-size         = 256
no-context-shift    = true
fit                 = on

# Offload, CPU threads and model loading
n-gpu-layers        = 99
threads             = 8
mmap                = false

# Attention and KV-cache element types
flash-attn          = on
cache-type-k        = bf16
cache-type-v        = bf16

# Chat templating
jinja               = true

# Multilingual embedding model for RAG, kept co-resident with `coder`
# (requires --models-max 2). Overrides global defaults that break embeddings:
# - batch-size / ubatch-size: an embedding input has to fit in one physical
#   batch, so the global ubatch-size of 256 is too small for long inputs.
#   llama.cpp caps the physical batch (ubatch-size) at the logical batch
#   (batch-size), so both are raised together; setting ubatch-size alone
#   would leave the effective limit at the batch-size default.
# - c: pinned to the model's 8192 max instead of the global 131072.
# - fit: turned off so the explicit sizes above are used as written.
[bge-m3]
hf                  = ggml-org/bge-m3-Q8_0-GGUF
embeddings          = true
pooling             = cls
batch-size          = 8192
ubatch-size         = 8192
c                   = 8192
fit                 = off
load-on-startup     = true

[coder]
# Model source and lifecycle
hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
load-on-startup     = true

# Context and concurrency: 524288 total context with 2 parallel slots.
# NOTE(review): presumably divided evenly, i.e. 262144 tokens per slot --
# confirm against the server's KV-cache mode.
c                   = 524288
parallel            = 2
threads-batch       = 16

# Speculative decoding
spec-type           = ngram-simple,draft-mtp
spec-draft-n-max    = 5

# Sampling
temp                = 0.6
top-k               = 20
top-p               = 0.95
min-p               = 0.0
presence-penalty    = 1.5

# Extra arguments handed to the chat template (JSON object)
chat-template-kwargs = {"preserve_thinking": true}

[fast]
# Model source and lifecycle
# NOTE(review): bge-m3, coder and fast all set load-on-startup = true, while
# the bge-m3 comment mentions --models-max 2 -- confirm the configured limit
# admits all three at startup.
hf                  = byteshape/Qwen3.6-35B-A3B-MTP-GGUF:IQ4_XS
load-on-startup     = true

# Context and concurrency (same values as the `[*]` defaults, stated
# explicitly here)
c                   = 131072
parallel            = 1
threads-batch       = 16

# Speculative decoding
spec-type           = ngram-simple,draft-mtp
spec-draft-n-max    = 3

# Sampling
temp                = 0.6
top-k               = 20
top-p               = 0.95
min-p               = 0.0
presence-penalty    = 1.5

# Extra arguments handed to the chat template (JSON object)
chat-template-kwargs = {"preserve_thinking": true}