Stand up document retrieval as shared, client-agnostic primitives rather than locking it inside Open WebUI:

- Qdrant as the LAN-reachable vector store
- LiteLLM gains a bge-m3 route so sgx:4000 also serves /v1/embeddings
- a thin `rag` CLI (ingest/query, optional coder synthesis) usable from any machine and from scripts

Embeddings and synthesis run on halo via the gateway; the CLI is configured entirely through RAG_* env vars.
53 lines
1.7 KiB
Nix
53 lines
1.7 KiB
Nix
{ config, ... }:

let
  # Both routes below target the same OpenAI-compatible endpoint on halo.
  haloApiBase = "http://halo:8000/v1";

  # Name of the sops secret that holds LiteLLM's environment file; used both
  # to declare the secret and to look up its decrypted path.
  envSecret = "litellm/env";

  # Build one `model_list` entry for a preset served by halo. llama-server
  # speaks the OpenAI API, so each preset is routed as an `openai/<preset>`
  # model and published on the gateway under the preset's own name.
  mkHaloRoute = preset: {
    model_name = preset;
    litellm_params = {
      model = "openai/${preset}";
      api_base = haloApiBase;
      api_key = "none"; # llama-server requires no key; value is ignored
    };
  };
in
{
  # LiteLLM acts as an OpenAI-compatible gateway in front of halo's
  # llama-server. It is a shared endpoint across the Tailnet (per-key routing,
  # logging, future cloud fallback), so clients talk to sgx:4000 rather than
  # hardcoding halo's address.
  services.litellm = {
    enable = true;
    host = "0.0.0.0";
    port = 4000; # 8080 is Open WebUI, 8081 is searx
    openFirewall = true; # reachable across the LAN
    environmentFile = config.sops.secrets.${envSecret}.path;

    settings = {
      model_list = [
        # halo exposes the `[coder]` preset from systems/.../halo/models.ini.
        (mkHaloRoute "coder")

        # Multilingual embeddings, also served by halo's router (the `[bge-m3]`
        # preset). Exposes /v1/embeddings on this gateway for the rag CLI.
        (mkHaloRoute "bge-m3")
      ];

      # `os.environ/<NAME>` makes LiteLLM read the value from its environment,
      # which is populated from `environmentFile` above.
      general_settings.master_key = "os.environ/LITELLM_MASTER_KEY";

      litellm_settings.drop_params = true;
    };
  };

  # Decrypted file must contain the env line: LITELLM_MASTER_KEY=sk-...
  # Read by systemd (as root) before dropping to litellm's DynamicUser.
  sops.secrets.${envSecret} = {
    sopsFile = ../../../.secrets/sgx/litellm.yaml;
    restartUnits = [ "litellm.service" ];
  };
}
|