feat(halo): serve bge-m3 embeddings alongside coder

Add a multilingual bge-m3 embedding model to the llama-server preset and
raise --models-max to 2 so it stays co-resident with the coder model.
This gives the RAG stack a local embeddings endpoint without a second
service, keeping all inference on halo. Embedding-specific overrides
(ubatch-size, context, pooling) are pinned since the global defaults
would truncate or misconfigure embedding requests.
This commit is contained in:
Harald Hoyer 2026-05-22 00:35:28 +02:00
parent a1b55fe2ec
commit ab729a0720
2 changed files with 14 additions and 1 deletions

View file

@ -14,6 +14,19 @@ no-context-shift = true
fit = on
c = 131072
# Multilingual embedding model for RAG, kept co-resident with `coder`
# (requires --models-max 2). Overrides global defaults that break embeddings:
# ubatch-size must cover the whole input (global 256 would truncate), and the
# context is pinned to the model's 8192 max instead of the global 131072.
[bge-m3]
# Q8_0 GGUF build of bge-m3, referenced by its Hugging Face repository name.
hf = ggml-org/bge-m3-Q8_0-GGUF
# Serve this model as an embeddings endpoint (used by the RAG stack).
embeddings = true
# NOTE(review): presumably CLS pooling is what bge-m3 expects for its dense
# embeddings - confirm against the model card before changing.
pooling = cls
# Sized to match the 8192-token context so a maximum-length input fits in a
# single micro-batch; the global value of 256 is too small for that.
# NOTE(review): the effective micro-batch may also be limited by the logical
# batch size - confirm the global batch-size is at least 8192.
ubatch-size = 8192
# Pinned to the model's 8192-token maximum instead of inheriting the global
# 131072.
c = 8192
# Overrides the `fit = on` set earlier in this file.
# NOTE(review): presumably disabled so the explicitly pinned sizes above are
# not auto-adjusted to fit device memory - confirm the intent.
fit = off
# NOTE(review): presumably loads the model at server start rather than on the
# first request, keeping it co-resident with `coder` - confirm.
load-on-startup = true
[coder]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
spec-type = ngram-simple,draft-mtp