feat(halo): add different llama servers

This commit is contained in:
Harald Hoyer 2026-05-07 14:34:58 +02:00
parent b548126fb8
commit 2009ea96b3
4 changed files with 122 additions and 6 deletions

View file

@ -5,7 +5,7 @@
}:
{
systemd.services.llama-server = {
description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
description = "llama.cpp server (Qwen3.6-27B, ROCm)";
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
wantedBy = [ "multi-user.target" ];
@ -29,13 +29,13 @@
ExecStart = lib.concatStringsSep " " [
"${pkgs.llama-cpp-rocm}/bin/llama-server"
"--flash-attn on"
"--parallel 1"
"--parallel 2"
"--jinja"
"--host 0.0.0.0"
"--port 8000"
"--no-mmap"
"--n-gpu-layers 99"
"-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
"-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
"--alias qwen3.6-27b"
"--threads 8"
"--ubatch-size 256"
@ -43,10 +43,9 @@
"--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
"--no-context-shift"
''--chat-template-kwargs '{"preserve_thinking": true}' ''
"-c 262144"
"-c 524288"
"--fit on"
"--slot-save-path %C/llama-server/kv-slots"
"--spec-type mtp --spec-draft-n-max 3"
];
Restart = "on-failure";
RestartSec = 10;