feat(halo): add different llama servers

parent b548126fb8
commit 2009ea96b3

4 changed files with 122 additions and 6 deletions
@@ -10,7 +10,7 @@ with lib.metacfg;
     ./hardware-configuration.nix
     #./xremap.nix
     ./wyoming.nix
-    ./llama-server.nix
+    ./llama-server-coder-next.nix
   ];

   boot.lanzaboote.pkiBundle = "/var/lib/sbctl";
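All three llama-server modules define the same systemd.services.llama-server unit, so importing more than one at a time would give conflicting definitions for options like description and ExecStart. A minimal sketch of how the host profile selects the active model, assuming this repo's file layout (the commented alternatives are illustrative, not part of this commit):

    # Sketch only: enable exactly one llama-server module per host.
    { ... }:
    {
      imports = [
        ./hardware-configuration.nix
        ./wyoming.nix
        # Pick one model profile; they all claim systemd.services.llama-server.
        ./llama-server-coder-next.nix
        # ./llama-server-27B-MTP.nix
        # ./llama-server.nix
      ];
    }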
systems/x86_64-linux/halo/llama-server-27B-MTP.nix (new file, 60 lines)
@@ -0,0 +1,60 @@
{
  pkgs,
  lib,
  ...
}:
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 1"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
        "--alias qwen3.6-27b"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 262144"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
        "--spec-type mtp --spec-draft-n-max 3"
      ];
      Restart = "on-failure";
      RestartSec = 10;

      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}
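The unit binds llama-server to 0.0.0.0:8000, so other machines can reach it only if the host firewall allows that port. A hedged sketch, assuming halo keeps the default NixOS firewall enabled (this rule is not part of the commit):

    # Assumption: the NixOS firewall is active on halo; open the llama-server port.
    {
      networking.firewall.allowedTCPPorts = [ 8000 ];
    }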
systems/x86_64-linux/halo/llama-server-coder-next.nix (new file, 57 lines)
@@ -0,0 +1,57 @@
{
  pkgs,
  lib,
  ...
}:
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 1"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--fit on"
        "--no-context-shift"
        "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
        "--alias qwen3-coder-next"
        "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
        "--slot-save-path %C/llama-server/kv-slots-coder-next"
      ];
      Restart = "on-failure";
      RestartSec = 10;

      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}
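Both new modules (and the modified server module below) launch "${pkgs.llama-cpp-rocm}/bin/llama-server", but llama-cpp-rocm is not a stock nixpkgs attribute, so it presumably comes from an overlay defined elsewhere in this flake. A hedged sketch of one way such an attribute could be provided, assuming nixpkgs' llama-cpp package still accepts a rocmSupport override:

    # Assumption: pkgs.llama-cpp-rocm is supplied by an overlay along these lines;
    # the real definition lives elsewhere in this repository.
    final: prev: {
      llama-cpp-rocm = prev.llama-cpp.override { rocmSupport = true; };
    }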
@@ -5,7 +5,7 @@
 }:
 {
   systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
+    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
     after = [ "network-online.target" ];
     wants = [ "network-online.target" ];
     wantedBy = [ "multi-user.target" ];

@@ -29,13 +29,13 @@
       ExecStart = lib.concatStringsSep " " [
         "${pkgs.llama-cpp-rocm}/bin/llama-server"
         "--flash-attn on"
-        "--parallel 1"
+        "--parallel 2"
         "--jinja"
         "--host 0.0.0.0"
         "--port 8000"
         "--no-mmap"
         "--n-gpu-layers 99"
-        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
+        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
         "--alias qwen3.6-27b"
         "--threads 8"
         "--ubatch-size 256"

@@ -43,10 +43,9 @@
         "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
         "--no-context-shift"
         ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
+        "-c 524288"
         "--fit on"
         "--slot-save-path %C/llama-server/kv-slots"
-        "--spec-type mtp --spec-draft-n-max 3"
       ];
       Restart = "on-failure";
       RestartSec = 10;