diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix
index 4cca9f8..8264f3a 100644
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -29,7 +29,7 @@
         "--host 0.0.0.0"
         "--port 8000"
         "--models-preset ${./models.ini}"
-        "--models-max 1"
+        "--models-max 2"
       ];
       Restart = "on-failure";
       RestartSec = 10;
diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index b35ddd0..00ad6ba 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -14,6 +14,22 @@ no-context-shift    = true
 fit                 = on
 c                   = 131072
 
+# Multilingual embedding model for RAG, kept co-resident with `coder`
+# (requires --models-max 2). Overrides global defaults that break embeddings:
+# an embedding input has to fit in a single physical batch, so ubatch-size is
+# raised from the global 256 (too small for long inputs) and batch-size is set
+# to match, since llama.cpp clamps the physical batch to the logical one. The
+# context is pinned to the model's 8192 max instead of the global 131072.
+[bge-m3]
+hf                  = ggml-org/bge-m3-Q8_0-GGUF
+embeddings          = true
+pooling             = cls
+batch-size          = 8192
+ubatch-size         = 8192
+c                   = 8192
+fit                 = off
+load-on-startup     = true
+
 [coder]
 hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-type           = ngram-simple,draft-mtp