feat(halo): serve bge-m3 embeddings alongside coder

Add a multilingual bge-m3 embedding model to the llama-server preset and raise --models-max to 2 so it stays co-resident with the coder model. This gives the RAG stack a local embeddings endpoint without a second service, keeping all inference on halo. Embedding-specific overrides (ubatch-size, context, pooling) are pinned since the global defaults would truncate or misconfigure embedding requests.
2026-05-22 00:35:28 +02:00 · 2026-05-22 00:35:28 +02:00 · ab729a0720
commit ab729a0720
parent a1b55fe2ec
2 changed files with 14 additions and 1 deletion
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -14,6 +14,19 @@ no-context-shift    = true
 fit                 = on
 c                   = 131072

+# Multilingual embedding model for RAG, kept co-resident with `coder`
+# (requires --models-max 2). Overrides global defaults that break embeddings:
+# ubatch-size must cover the whole input (the global 256 would truncate it),
+# and the context (c) is pinned to the model's 8192-token max, not 131072.
+[bge-m3]
+hf                  = ggml-org/bge-m3-Q8_0-GGUF
+embeddings          = true
+pooling             = cls
+ubatch-size         = 8192
+c                   = 8192
+fit                 = off
+load-on-startup     = true
+
 [coder]
 hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-type           = ngram-simple,draft-mtp