feat(halo): serve multiple llama models via models.ini preset

Replace the per-model llama-server units with a single service that
uses llama-server's --models-preset (models.ini) and --models-max 2,
so the 35B-A3B and 27B models are loaded on demand from one config.

Drop the now-redundant 27B / 27B-MTP / coder-next variant files and
the unused CacheDirectory + slot-save-path KV-slot handling.
This commit is contained in:
Harald Hoyer 2026-05-20 00:19:27 +02:00
parent ae068cfd84
commit 0edf975c30
6 changed files with 34 additions and 199 deletions

View file

@ -10,8 +10,7 @@ with lib.metacfg;
./hardware-configuration.nix
#./xremap.nix
./wyoming.nix
#./llama-server-coder-next.nix
./llama-server-27B-MTP.nix
./llama-server.nix
];
boot.lanzaboote.pkiBundle = "/var/lib/sbctl";

View file

@ -1,61 +0,0 @@
# NixOS module defining the `llama-server` systemd unit for the
# Qwen3.6-27B-MTP GGUF (Q6_K), served by the ROCm build of llama.cpp on port 8000.
{
  pkgs,
  lib,
  ...
}:
let
  # Full command line; entries are joined with single spaces to form ExecStart.
  # List order is preserved exactly as the flags are passed to llama-server.
  serverArgs = [
    "${pkgs.llama-cpp-rocm}/bin/llama-server"
    "--flash-attn on"
    "--parallel 2"
    "--jinja"
    "--host 0.0.0.0"
    "--port 8000"
    "--no-mmap"
    "--n-gpu-layers 99"
    "-hf unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K"
    "--alias halo-8000"
    "--threads 8"
    "--ubatch-size 256"
    "-ctk bf16 -ctv bf16"
    "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
    "--no-context-shift"
    ''--chat-template-kwargs '{"preserve_thinking": true}' ''
    "-c 524288"
    "--fit on"
    "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
    "--spec-type draft-mtp --spec-draft-n-max 6"
    #"--spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64"
  ];

  # Restart the unit 10 seconds after a failed exit.
  restartPolicy = {
    Restart = "on-failure";
    RestartSec = 10;
  };

  # Sandboxing switches applied to the unit.
  hardening = {
    PrivateTmp = true;
    ProtectSystem = "strict";
    ProtectHome = true;
    NoNewPrivileges = true;
  };
in
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];

    # HOME and the Hugging Face cache both point at the unit's state directory.
    environment = {
      HF_HOME = "%S/llama-server";
      HOME = "%S/llama-server";
    };

    serviceConfig =
      {
        Type = "simple";
        DynamicUser = true;
        # NOTE(review): video/render presumably grant GPU device access for ROCm — confirm.
        SupplementaryGroups = [
          "video"
          "render"
        ];
        StateDirectory = "llama-server";
        CacheDirectory = "llama-server";
        WorkingDirectory = "%S/llama-server";
        # Create the directory given to --slot-save-path before the server starts.
        ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
        ExecStart = lib.concatStringsSep " " serverArgs;
      }
      // restartPolicy
      // hardening;
  };
}

View file

@ -1,60 +0,0 @@
# NixOS module defining the `llama-server` systemd unit for the
# Qwen3.6-27B GGUF (UD-Q8_K_XL), served by the ROCm build of llama.cpp on port 8000.
{
  pkgs,
  lib,
  ...
}:
let
  # Full command line; entries are joined with single spaces to form ExecStart.
  # List order is preserved exactly as the flags are passed to llama-server.
  serverArgs = [
    "${pkgs.llama-cpp-rocm}/bin/llama-server"
    "--flash-attn on"
    "--parallel 2"
    "--jinja"
    "--host 0.0.0.0"
    "--port 8000"
    "--no-mmap"
    "--n-gpu-layers 99"
    "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
    "--alias halo-8000"
    "--threads 8"
    "--ubatch-size 256"
    "-ctk bf16 -ctv bf16"
    "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
    "--no-context-shift"
    ''--chat-template-kwargs '{"preserve_thinking": true}' ''
    "-c 524288"
    "--fit on"
    "--slot-save-path %C/llama-server/kv-slots-27B"
    "--cache-ram 0"
  ];

  # Restart the unit 10 seconds after a failed exit.
  restartPolicy = {
    Restart = "on-failure";
    RestartSec = 10;
  };

  # Sandboxing switches applied to the unit.
  hardening = {
    PrivateTmp = true;
    ProtectSystem = "strict";
    ProtectHome = true;
    NoNewPrivileges = true;
  };
in
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B, ROCm)";
    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];

    # HOME and the Hugging Face cache both point at the unit's state directory.
    environment = {
      HF_HOME = "%S/llama-server";
      HOME = "%S/llama-server";
    };

    serviceConfig =
      {
        Type = "simple";
        DynamicUser = true;
        # NOTE(review): video/render presumably grant GPU device access for ROCm — confirm.
        SupplementaryGroups = [
          "video"
          "render"
        ];
        StateDirectory = "llama-server";
        CacheDirectory = "llama-server";
        WorkingDirectory = "%S/llama-server";
        # Create the directory given to --slot-save-path before the server starts.
        ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B";
        ExecStart = lib.concatStringsSep " " serverArgs;
      }
      // restartPolicy
      // hardening;
  };
}

View file

@ -1,57 +0,0 @@
# NixOS module defining the `llama-server` systemd unit for the
# Qwen3-Coder-Next GGUF (UD-Q8_K_XL), served by the ROCm build of llama.cpp on port 8000.
{
  pkgs,
  lib,
  ...
}:
let
  # Full command line; entries are joined with single spaces to form ExecStart.
  # List order is preserved exactly as the flags are passed to llama-server.
  serverArgs = [
    "${pkgs.llama-cpp-rocm}/bin/llama-server"
    "--flash-attn on"
    "--parallel 1"
    "--jinja"
    "--host 0.0.0.0"
    "--port 8000"
    "--no-mmap"
    "--n-gpu-layers 99"
    "--threads 8"
    "--ubatch-size 256"
    "-ctk bf16 -ctv bf16"
    "--fit on"
    "--no-context-shift"
    "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
    "--alias halo-8000"
    "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
    "--slot-save-path %C/llama-server/kv-slots-coder-next"
  ];

  # Restart the unit 10 seconds after a failed exit.
  restartPolicy = {
    Restart = "on-failure";
    RestartSec = 10;
  };

  # Sandboxing switches applied to the unit.
  hardening = {
    PrivateTmp = true;
    ProtectSystem = "strict";
    ProtectHome = true;
    NoNewPrivileges = true;
  };
in
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];

    # HOME and the Hugging Face cache both point at the unit's state directory.
    environment = {
      HF_HOME = "%S/llama-server";
      HOME = "%S/llama-server";
    };

    serviceConfig =
      {
        Type = "simple";
        DynamicUser = true;
        # NOTE(review): video/render presumably grant GPU device access for ROCm — confirm.
        SupplementaryGroups = [
          "video"
          "render"
        ];
        StateDirectory = "llama-server";
        CacheDirectory = "llama-server";
        WorkingDirectory = "%S/llama-server";
        # Create the directory given to --slot-save-path before the server starts.
        ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
        ExecStart = lib.concatStringsSep " " serverArgs;
      }
      // restartPolicy
      // hardening;
  };
}

View file

@ -5,7 +5,7 @@
}:
{
systemd.services.llama-server = {
description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
description = "llama.cpp server (multi-model preset, ROCm)";
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
wantedBy = [ "multi-user.target" ];
@ -23,29 +23,13 @@
"render"
];
StateDirectory = "llama-server";
CacheDirectory = "llama-server";
WorkingDirectory = "%S/llama-server";
ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots";
ExecStart = lib.concatStringsSep " " [
"${pkgs.llama-cpp-rocm}/bin/llama-server"
"--flash-attn on"
"--parallel 1"
"--jinja"
"--host 0.0.0.0"
"--port 8000"
"--no-mmap"
"--n-gpu-layers 99"
"-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
"--alias halo-8000"
"--threads 8"
"--ubatch-size 256"
"-ctk bf16 -ctv bf16"
"--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
"--no-context-shift"
''--chat-template-kwargs '{"preserve_thinking": true}' ''
"-c 262144"
"--fit on"
"--slot-save-path %C/llama-server/kv-slots"
"--models-preset ${./models.ini}"
"--models-max 2"
];
Restart = "on-failure";
RestartSec = 10;

View file

@ -0,0 +1,30 @@
version = 1
; Model preset for llama-server, handed to it via `--models-preset` by the
; llama-server systemd unit (which also sets `--models-max 2`).
; Keys are the llama-server flag names without the leading dashes; most of
; them were previously passed directly on the unit's command line.

; NOTE(review): `[*]` appears to hold defaults shared by every model section
; below — confirm against the llama-server preset documentation.
[*]
flash-attn = on
parallel = 2
jinja = true
n-gpu-layers = 99
threads = 8
ubatch-size = 256
cache-type-k = bf16
cache-type-v = bf16
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
; NOTE(review): stands in for the former `--no-mmap` flag; verify that the
; preset parser accepts the `mmap = false` spelling.
mmap = false
no-context-shift = true
chat-template-kwargs = {"preserve_thinking": true}
; Context size. The former single-model 35B-A3B unit used 262144; with this
; shared default both models now request 524288.
c = 524288
fit = on
spec-type = draft-mtp
; Per-model sections: `hf` is the Hugging Face repo and quant tag to fetch.
; The 35B-A3B entry now points at the MTP repo with UD-Q6_K (the former unit
; used unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL).
[Qwen3.6-35B-A3B]
hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
spec-draft-n-max = 3
[Qwen3.6-27B]
hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
spec-draft-n-max = 6