feat(rag): route the fast model and use it for synthesis by default

Expose halo's [fast] MoE preset through the LiteLLM gateway and make it
the rag CLI's default chat model (overridable via RAG_CHAT_MODEL), so
query synthesis is quicker than with the larger coder model.
This commit is contained in:
Harald Hoyer 2026-05-22 09:15:59 +02:00
parent 2b1bba0703
commit bc0d79db57
2 changed files with 10 additions and 1 deletion

View file

@ -27,7 +27,7 @@ writers.writePython3Bin "rag"
API_KEY = os.environ.get("RAG_API_KEY", "none") API_KEY = os.environ.get("RAG_API_KEY", "none")
QDRANT_URL = os.environ.get("RAG_QDRANT_URL", "http://sgx:6333") QDRANT_URL = os.environ.get("RAG_QDRANT_URL", "http://sgx:6333")
EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "bge-m3") EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "bge-m3")
CHAT_MODEL = os.environ.get("RAG_CHAT_MODEL", "coder") CHAT_MODEL = os.environ.get("RAG_CHAT_MODEL", "fast")
DEFAULT_COLLECTION = os.environ.get("RAG_COLLECTION", "docs") DEFAULT_COLLECTION = os.environ.get("RAG_COLLECTION", "docs")
client = OpenAI(base_url=API_BASE, api_key=API_KEY) client = OpenAI(base_url=API_BASE, api_key=API_KEY)

View file

@ -22,6 +22,15 @@
api_key = "none"; # llama-server requires no key; value is ignored api_key = "none"; # llama-server requires no key; value is ignored
}; };
} }
{
# Faster MoE chat model (the `[fast]` preset), default for rag synthesis.
# The name matches the rag CLI's default RAG_CHAT_MODEL value ("fast").
model_name = "fast";
litellm_params = {
# NOTE(review): the "openai/" prefix presumably selects LiteLLM's
# OpenAI-compatible provider, with "fast" as the upstream model name; confirm.
model = "openai/fast";
# Same host as the name in "halo's [fast] preset"; the /v1 path suggests an
# OpenAI-style API root. TODO confirm this matches the other halo entries.
api_base = "http://halo:8000/v1";
# Placeholder value, as in the preceding entry; presumably ignored by the
# backend because no key is required. Confirm.
api_key = "none";
};
}
{ {
# Multilingual embeddings, also served by halo's router (the `[bge-m3]` # Multilingual embeddings, also served by halo's router (the `[bge-m3]`
# preset). Exposes /v1/embeddings on this gateway for the rag CLI. # preset). Exposes /v1/embeddings on this gateway for the rag CLI.