From ab729a0720b06844ac9fe9e4b7a4391d5517c32e Mon Sep 17 00:00:00 2001
From: Harald Hoyer <harald@hoyer.xyz>
Date: Fri, 22 May 2026 00:35:28 +0200
Subject: [PATCH] feat(halo): serve bge-m3 embeddings alongside coder

Add a multilingual bge-m3 embedding model to the llama-server preset and
raise --models-max to 2 so it stays co-resident with the coder model.
This gives the RAG stack a local embeddings endpoint without a second
service, keeping all inference on halo. Embedding-specific overrides
(ubatch-size, context, pooling) are pinned since the global defaults
would truncate or misconfigure embedding requests. The preset also
sets fit = off for this model and loads it on startup.
---
 systems/x86_64-linux/halo/llama-server.nix |  2 +-
 systems/x86_64-linux/halo/models.ini       | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix
index 4cca9f8..8264f3a 100644
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -29,7 +29,7 @@
         "--host 0.0.0.0"
         "--port 8000"
         "--models-preset ${./models.ini}"
-        "--models-max 1"
+        "--models-max 2"
       ];
       Restart = "on-failure";
       RestartSec = 10;
diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index b35ddd0..00ad6ba 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -14,6 +14,19 @@ no-context-shift    = true
 fit                 = on
 c                   = 131072
 
+# Multilingual RAG embedding model, co-resident with `coder` (requires
+# --models-max 2). Global ubatch-size 256 would truncate inputs; batch-size is
+# pinned too so it cannot cap ubatch. c: model's 8192 max, not global 131072.
+[bge-m3]
+hf                  = ggml-org/bge-m3-Q8_0-GGUF
+embeddings          = true
+pooling             = cls
+batch-size          = 8192
+ubatch-size         = 8192
+c                   = 8192
+fit                 = off
+load-on-startup     = true
+
 [coder]
 hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-type           = ngram-simple,draft-mtp