feat(halo): serve bge-m3 embeddings alongside coder
Add a multilingual bge-m3 embedding model to the llama-server preset and raise --models-max to 2 so it stays co-resident with the coder model. This gives the RAG stack a local embeddings endpoint without a second service, keeping all inference on halo. Embedding-specific overrides (ubatch-size, context, pooling) are pinned since the global defaults would truncate or misconfigure embedding requests.
This commit is contained in:
parent
a1b55fe2ec
commit
ab729a0720
2 changed files with 14 additions and 1 deletion
|
|
@@ -29,7 +29,7 @@
|
||||||
"--host 0.0.0.0"
|
"--host 0.0.0.0"
|
||||||
"--port 8000"
|
"--port 8000"
|
||||||
"--models-preset ${./models.ini}"
|
"--models-preset ${./models.ini}"
|
||||||
"--models-max 1"
|
"--models-max 2"
|
||||||
];
|
];
|
||||||
Restart = "on-failure";
|
Restart = "on-failure";
|
||||||
RestartSec = 10;
|
RestartSec = 10;
|
||||||
|
|
|
||||||
|
|
@@ -14,6 +14,19 @@ no-context-shift = true
|
||||||
fit = on
|
fit = on
|
||||||
c = 131072
|
c = 131072
|
||||||
|
|
||||||
|
# Multilingual embedding model for RAG, kept co-resident with `coder`
|
||||||
|
# (requires --models-max 2). Overrides global defaults that break embeddings:
|
||||||
|
# ubatch-size must cover the whole input (global 256 would truncate), and the
|
||||||
|
# context is pinned to the model's 8192 max instead of the global 131072.
|
||||||
|
[bge-m3]
|
||||||
|
hf = ggml-org/bge-m3-Q8_0-GGUF
|
||||||
|
embeddings = true
|
||||||
|
pooling = cls
|
||||||
|
ubatch-size = 8192
|
||||||
|
c = 8192
|
||||||
|
fit = off
|
||||||
|
load-on-startup = true
|
||||||
|
|
||||||
[coder]
|
[coder]
|
||||||
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
|
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
|
||||||
spec-type = ngram-simple,draft-mtp
|
spec-type = ngram-simple,draft-mtp
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue