Uptime Kuma already binds 4000, so the gateway never got the port and requests hit the wrong service. Move LiteLLM to 4001 and update the rag CLI default endpoint to match.
53 lines
1.7 KiB
Nix
53 lines
1.7 KiB
Nix
{ config, ... }:
let
  # halo's llama-server router. It speaks the OpenAI API, so every preset it
  # serves is routed through LiteLLM as an `openai/*` model on this base URL.
  haloApiBase = "http://halo:8000/v1";

  # Build one `model_list` entry for a preset served by halo. The name that
  # clients request on the gateway is the same as halo's preset name.
  haloModel = name: {
    model_name = name;
    litellm_params = {
      model = "openai/${name}";
      api_base = haloApiBase;
      api_key = "none"; # llama-server requires no key; value is ignored
    };
  };
in
{
  # OpenAI-compatible gateway in front of halo's llama-server, exposed as a
  # shared endpoint across the Tailnet (per-key routing, logging, future cloud
  # fallback) so clients hit sgx:4001 instead of hardcoding halo's address.
  services.litellm = {
    enable = true;
    host = "0.0.0.0";
    # Ports already taken on this host: 8080 Open WebUI, 8081 searx,
    # 4000 uptime-kuma. Keep the header comment above in sync when changing this.
    port = 4001;
    openFirewall = true; # reachable across the LAN
    environmentFile = config.sops.secrets."litellm/env".path;

    settings = {
      model_list = [
        # halo exposes the `[coder]` preset from systems/.../halo/models.ini.
        (haloModel "coder")

        # Multilingual embeddings, also served by halo's router (the `[bge-m3]`
        # preset). Exposes /v1/embeddings on this gateway for the rag CLI.
        (haloModel "bge-m3")
      ];

      general_settings = {
        # Read from the environment rather than stored in the Nix store; the
        # variable is supplied by the sops-managed environmentFile above.
        master_key = "os.environ/LITELLM_MASTER_KEY";
      };

      litellm_settings = {
        drop_params = true;
      };
    };
  };

  # Decrypted file must contain the env line: LITELLM_MASTER_KEY=sk-...
  # Read by systemd (as root) before dropping to litellm's DynamicUser.
  sops.secrets."litellm/env" = {
    sopsFile = ../../../.secrets/sgx/litellm.yaml;
    # Restart the gateway whenever the secret changes so a rotated key is
    # picked up.
    restartUnits = [ "litellm.service" ];
  };
}