diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix
index 4cca9f8..8264f3a 100644
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -29,7 +29,7 @@
         "--host 0.0.0.0"
         "--port 8000"
         "--models-preset ${./models.ini}"
-        "--models-max 1"
+        "--models-max 2"
       ];
       Restart = "on-failure";
       RestartSec = 10;
diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index b35ddd0..00ad6ba 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -14,6 +14,22 @@ no-context-shift = true
 fit = on
 c = 131072
 
+# Multilingual embedding model for RAG, kept co-resident with `coder`
+# (requires --models-max 2). Overrides global defaults that break embeddings:
+# an embedding input must fit in a single physical batch, so ubatch-size is
+# raised from the global 256 (which would truncate); batch-size is set to match
+# because the physical batch is capped at the logical batch size; and the
+# context is pinned to the model's 8192 max instead of the global 131072.
+[bge-m3]
+hf = ggml-org/bge-m3-Q8_0-GGUF
+embeddings = true
+pooling = cls
+batch-size = 8192
+ubatch-size = 8192
+c = 8192
+fit = off
+load-on-startup = true
+
 [coder]
 hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-type = ngram-simple,draft-mtp