nixcfg/systems/x86_64-linux/sgx/litellm.nix
Harald Hoyer bc0d79db57 feat(rag): route the fast model and use it for synthesis by default
Expose halo's [fast] MoE preset through the LiteLLM gateway and make it
the rag CLI's default chat model (overridable via RAG_CHAT_MODEL), so
query synthesis is quicker than the larger coder model.
2026-05-22 09:15:59 +02:00

62 lines
2 KiB
Nix

{ config, ... }:
{
# OpenAI-compatible gateway in front of halo's llama-server, exposed as a
# shared endpoint across the Tailnet (per-key routing, logging, future cloud
# fallback) so clients hit sgx:4001 instead of hardcoding halo's address.
services.litellm =
  let
    # All routed models share one upstream: halo's llama-server, which speaks
    # the OpenAI API (hence the `openai/` provider prefix used below).
    haloApiBase = "http://halo:8000/v1";

    # Build one model_list entry for a halo preset (the presets are referred
    # to as `[name]` sections of systems/.../halo/models.ini). The name that
    # clients request on the gateway and the upstream model name are the same.
    haloPreset = preset: {
      model_name = preset;
      litellm_params = {
        model = "openai/${preset}";
        api_base = haloApiBase;
        api_key = "none"; # llama-server requires no key; value is ignored
      };
    };
  in
  {
    enable = true;

    # Listener: all interfaces, firewall opened so the LAN can reach it.
    host = "0.0.0.0";
    port = 4001; # 8080 Open WebUI, 8081 searx, 4000 uptime-kuma
    openFirewall = true;

    # Environment file decrypted by sops; supplies LITELLM_MASTER_KEY.
    environmentFile = config.sops.secrets."litellm/env".path;

    settings = {
      model_list = map haloPreset [
        "coder" # the `[coder]` preset
        "fast" # faster MoE chat model (`[fast]`), default for rag synthesis
        "bge-m3" # multilingual embeddings (`[bge-m3]`); /v1/embeddings for the rag CLI
      ];

      # The master key is read from the environment, not stored in the config.
      general_settings.master_key = "os.environ/LITELLM_MASTER_KEY";

      litellm_settings.drop_params = true;
    };
  };
# systemd reads this file (as root) before dropping to litellm's DynamicUser.
# Once decrypted it must contain the env line: LITELLM_MASTER_KEY=sk-...
sops.secrets."litellm/env" = {
  # Tie the gateway unit to this secret via sops-nix's restartUnits.
  restartUnits = [ "litellm.service" ];
  sopsFile = ../../../.secrets/sgx/litellm.yaml;
};
}