feat(halo): add different llama servers

2026-05-07 14:34:58 +02:00 · 2026-05-07 14:34:58 +02:00 · d47bb6e15b
commit d47bb6e15b
parent b548126fb8
4 changed files with 122 additions and 6 deletions
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@ -5,7 +5,7 @@
 }:
 {
  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
+    description = "llama.cpp server (Qwen3.6-27B, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
@ -29,13 +29,13 @@
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
-        "--parallel 1"
+        "--parallel 2"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
-        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
+        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
        "--alias qwen3.6-27b"
        "--threads 8"
        "--ubatch-size 256"
@ -43,10 +43,9 @@
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
+        "-c 524288"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots"
-        "--spec-type mtp --spec-draft-n-max 3"
      ];
      Restart = "on-failure";
      RestartSec = 10;