From ab729a0720b06844ac9fe9e4b7a4391d5517c32e Mon Sep 17 00:00:00 2001 From: Harald Hoyer Date: Fri, 22 May 2026 00:35:28 +0200 Subject: [PATCH] feat(halo): serve bge-m3 embeddings alongside coder Add a multilingual bge-m3 embedding model to the llama-server preset and raise --models-max to 2 so it stays co-resident with the coder model. This gives the RAG stack a local embeddings endpoint without a second service, keeping all inference on halo. Embedding-specific overrides (batch-size, ubatch-size, context, pooling) are pinned since the global defaults would truncate or misconfigure embedding requests. --- systems/x86_64-linux/halo/llama-server.nix | 2 +- systems/x86_64-linux/halo/models.ini | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix index 4cca9f8..8264f3a 100644 --- a/systems/x86_64-linux/halo/llama-server.nix +++ b/systems/x86_64-linux/halo/llama-server.nix @@ -29,7 +29,7 @@ "--host 0.0.0.0" "--port 8000" "--models-preset ${./models.ini}" - "--models-max 1" + "--models-max 2" ]; Restart = "on-failure"; RestartSec = 10; diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini index b35ddd0..00ad6ba 100644 --- a/systems/x86_64-linux/halo/models.ini +++ b/systems/x86_64-linux/halo/models.ini @@ -14,6 +14,21 @@ no-context-shift = true fit = on c = 131072 +# Multilingual embedding model for RAG, kept co-resident with `coder` +# (requires --models-max 2). Overrides global defaults that break embeddings: +# ubatch-size must cover the whole input (global 256 would truncate), and the +# context is pinned to the model's 8192 max instead of the global 131072. +# batch-size is raised to match: the effective ubatch-size is capped at batch-size. +[bge-m3] +hf = ggml-org/bge-m3-Q8_0-GGUF +embeddings = true +pooling = cls +batch-size = 8192 +ubatch-size = 8192 +c = 8192 +fit = off +load-on-startup = true + [coder] hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K spec-type = ngram-simple,draft-mtp