nixcfg/systems/x86_64-linux/halo/llama-server.nix
Harald Hoyer ab729a0720 feat(halo): serve bge-m3 embeddings alongside coder
Add a multilingual bge-m3 embedding model to the llama-server preset and
raise --models-max to 2 so it stays co-resident with the coder model.
This gives the RAG stack a local embeddings endpoint without a second
service, keeping all inference on halo. Embedding-specific overrides
(ubatch-size, context, pooling) are pinned since the global defaults
would truncate or misconfigure embedding requests.
2026-05-22 00:35:54 +02:00

43 lines
972 B
Nix

{
pkgs,
lib,
...
}:
{
systemd.services.llama-server = {
description = "llama.cpp server (multi-model preset, ROCm)";
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
wantedBy = [ "multi-user.target" ];
environment = {
HOME = "%S/llama-server";
HF_HOME = "%S/llama-server";
};
serviceConfig = {
Type = "simple";
DynamicUser = true;
SupplementaryGroups = [
"video"
"render"
];
StateDirectory = "llama-server";
WorkingDirectory = "%S/llama-server";
ExecStart = lib.concatStringsSep " " [
"${pkgs.llama-cpp-rocm}/bin/llama-server"
"--host 0.0.0.0"
"--port 8000"
"--models-preset ${./models.ini}"
"--models-max 2"
];
Restart = "on-failure";
RestartSec = 10;
PrivateTmp = true;
ProtectSystem = "strict";
ProtectHome = true;
NoNewPrivileges = true;
};
};
}