Add a multilingual bge-m3 embedding model to the llama-server preset and raise --models-max to 2 so it stays co-resident with the coder model. This gives the RAG stack a local embeddings endpoint without a second service, keeping all inference on halo. Embedding-specific overrides (ubatch-size, context, pooling) are pinned since the global defaults would truncate or misconfigure embedding requests.
43 lines
1.2 KiB
INI
43 lines
1.2 KiB
INI
# Preset file format version.
version = 1

# Defaults applied to every model section below; a model section may override
# any of these keys.
[*]
# Model loading and CPU threads
n-gpu-layers = 99
threads = 8
mmap = false

# Context and batching
c = 131072
fit = on
no-context-shift = true
parallel = 1
ubatch-size = 256

# Attention and KV cache
flash-attn = on
cache-type-k = bf16
cache-type-v = bf16

# Chat templating
jinja = true

# Multilingual embedding model for RAG, kept co-resident with `coder`
# (requires --models-max 2). Overrides global defaults that break embeddings:
# an embedding input must fit in a single physical batch, so ubatch-size has to
# cover the whole input (the global 256 is far too small), and batch-size is
# raised with it because the physical batch is capped at the logical batch
# size (default 2048). The context is pinned to the model's 8192 max instead
# of the global 131072.
[bge-m3]
hf = ggml-org/bge-m3-Q8_0-GGUF
embeddings = true
pooling = cls
batch-size = 8192
ubatch-size = 8192
c = 8192
fit = off
load-on-startup = true

# Coding model, loaded at startup and kept resident next to `bge-m3`.
[coder]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
load-on-startup = true

# Overrides of the global `parallel` and `c` defaults: two slots and a larger
# context. NOTE(review): presumably 524288 is sized so that each of the two
# slots gets 262144 tokens -- confirm how the server splits context per slot.
parallel = 2
c = 524288

# Threads used for batch (prompt) processing
threads-batch = 16

# Speculative decoding
spec-type = ngram-simple,draft-mtp
spec-draft-n-max = 5

# Sampling defaults
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 1.5

# Extra arguments handed to the chat template
chat-template-kwargs = {"preserve_thinking": true}