diff --git a/packages/rag/default.nix b/packages/rag/default.nix index 1335461..dd0223e 100644 --- a/packages/rag/default.nix +++ b/packages/rag/default.nix @@ -27,7 +27,7 @@ writers.writePython3Bin "rag" API_KEY = os.environ.get("RAG_API_KEY", "none") QDRANT_URL = os.environ.get("RAG_QDRANT_URL", "http://sgx:6333") EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "bge-m3") - CHAT_MODEL = os.environ.get("RAG_CHAT_MODEL", "coder") + CHAT_MODEL = os.environ.get("RAG_CHAT_MODEL", "fast") DEFAULT_COLLECTION = os.environ.get("RAG_COLLECTION", "docs") client = OpenAI(base_url=API_BASE, api_key=API_KEY) diff --git a/systems/x86_64-linux/sgx/litellm.nix b/systems/x86_64-linux/sgx/litellm.nix index 29b03d8..f8cdb7b 100644 --- a/systems/x86_64-linux/sgx/litellm.nix +++ b/systems/x86_64-linux/sgx/litellm.nix @@ -22,6 +22,15 @@ api_key = "none"; # llama-server requires no key; value is ignored }; } + { + # Faster MoE chat model (the `[fast]` preset), default for rag synthesis. + model_name = "fast"; + litellm_params = { + model = "openai/fast"; + api_base = "http://halo:8000/v1"; + api_key = "none"; + }; + } { # Multilingual embeddings, also served by halo's router (the `[bge-m3]` # preset). Exposes /v1/embeddings on this gateway for the rag CLI.