feat(halo): add llama-server systemd unit for Qwen3.6-35B-A3B

Runs llama.cpp's ROCm build under DynamicUser, with the HF model cache
in StateDirectory (survives systemctl clean) and KV slot saves in
CacheDirectory. Listens on :8000.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Harald Hoyer 2026-05-05 10:02:51 +02:00
parent 603e435db8
commit b11e5c8356
2 changed files with 61 additions and 0 deletions

View file

@@ -0,0 +1,60 @@
{
  pkgs,
  lib,
  ...
}:
{
  # llama.cpp inference server (ROCm build) serving Qwen3.6-35B-A3B on :8000.
  # Runs as a DynamicUser; the Hugging Face model cache lives in the state
  # directory (/var/lib/llama-server) and KV slot saves in the cache directory.
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
    # First start downloads the model via -hf, so wait for real connectivity.
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      # DynamicUser has no persistent home; point HOME and HF_HOME at the
      # state directory (%S = /var/lib) so the HF download cache survives
      # restarts and `systemctl clean --what=cache`.
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
      # ROCm GFX version override; mkDefault lets per-host config replace it.
      HSA_OVERRIDE_GFX_VERSION = lib.mkDefault "11.0.0";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      # GPU device-node access for the transient user.
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      # CacheDirectory= accepts subdirectory paths: listing the kv-slots
      # subdirectory here makes systemd create it with the right ownership,
      # replacing the previous ExecStartPre `mkdir -p` (and its coreutils
      # dependency).
      CacheDirectory = [
        "llama-server"
        "llama-server/kv-slots"
      ];
      WorkingDirectory = "%S/llama-server";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 2"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
        "--alias qwen3.6-35b-a3b"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk q8_0 -ctv q8_0"
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 524288"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots"
      ];
      Restart = "on-failure";
      RestartSec = 10;

      # Sandboxing: writable paths are limited to the State/Cache directories
      # declared above.
      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}