feat(halo): serve multiple llama models via models.ini preset

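Replace the per-model llama-server units with a single service that uses llama-server's --models-preset (models.ini) and --models-max 2, so the 35B-A3B and 27B models are loaded on demand from one config. Drop the now-redundant 27B / 27B-MTP / coder-next variant files and the unused CacheDirectory + slot-save-path KV-slot handling. Behavior changes for the existing 35B-A3B service: the model switches from unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL to the MTP build unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K with spec-type draft-mtp, the shared defaults raise --parallel from 1 to 2 and -c from 262144 to 524288, and the --alias halo-8000 name is no longer set.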
2026-05-20 00:19:27 +02:00 · 2026-05-20 00:19:27 +02:00 · 0edf975c30
commit 0edf975c30
parent ae068cfd84
6 changed files with 34 additions and 199 deletions
--- a/systems/x86_64-linux/halo/default.nix
+++ b/systems/x86_64-linux/halo/default.nix
@@ -10,8 +10,7 @@ with lib.metacfg;
    ./hardware-configuration.nix
    #./xremap.nix
    ./wyoming.nix
-    #./llama-server-coder-next.nix
-    ./llama-server-27B-MTP.nix
+    ./llama-server.nix
  ];

  boot.lanzaboote.pkiBundle = "/var/lib/sbctl";
--- a/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
+++ b/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
@@ -1,61 +0,0 @@
-{
-  pkgs,
-  lib,
-  ...
-}:
-{
-  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
-    after = [ "network-online.target" ];
-    wants = [ "network-online.target" ];
-    wantedBy = [ "multi-user.target" ];
-
-    environment = {
-      HOME = "%S/llama-server";
-      HF_HOME = "%S/llama-server";
-    };
-
-    serviceConfig = {
-      Type = "simple";
-      DynamicUser = true;
-      SupplementaryGroups = [
-        "video"
-        "render"
-      ];
-      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
-      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
-      ExecStart = lib.concatStringsSep " " [
-        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 2"
-        "--jinja"
-        "--host 0.0.0.0"
-        "--port 8000"
-        "--no-mmap"
-        "--n-gpu-layers 99"
-        "-hf unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K"
-        "--alias halo-8000"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-        "--no-context-shift"
-        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 524288"
-        "--fit on"
-        "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
-        "--spec-type draft-mtp --spec-draft-n-max 6"
-        #"--spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64"
-      ];
-      Restart = "on-failure";
-      RestartSec = 10;
-
-      PrivateTmp = true;
-      ProtectSystem = "strict";
-      ProtectHome = true;
-      NoNewPrivileges = true;
-    };
-  };
-}
--- a/systems/x86_64-linux/halo/llama-server-27B.nix
+++ b/systems/x86_64-linux/halo/llama-server-27B.nix
@@ -1,60 +0,0 @@
-{
-  pkgs,
-  lib,
-  ...
-}:
-{
-  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B, ROCm)";
-    after = [ "network-online.target" ];
-    wants = [ "network-online.target" ];
-    wantedBy = [ "multi-user.target" ];
-
-    environment = {
-      HOME = "%S/llama-server";
-      HF_HOME = "%S/llama-server";
-    };
-
-    serviceConfig = {
-      Type = "simple";
-      DynamicUser = true;
-      SupplementaryGroups = [
-        "video"
-        "render"
-      ];
-      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
-      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B";
-      ExecStart = lib.concatStringsSep " " [
-        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 2"
-        "--jinja"
-        "--host 0.0.0.0"
-        "--port 8000"
-        "--no-mmap"
-        "--n-gpu-layers 99"
-        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
-        "--alias halo-8000"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-        "--no-context-shift"
-        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 524288"
-        "--fit on"
-        "--slot-save-path %C/llama-server/kv-slots-27B"
-        "--cache-ram 0"
-      ];
-      Restart = "on-failure";
-      RestartSec = 10;
-
-      PrivateTmp = true;
-      ProtectSystem = "strict";
-      ProtectHome = true;
-      NoNewPrivileges = true;
-    };
-  };
-}
--- a/systems/x86_64-linux/halo/llama-server-coder-next.nix
+++ b/systems/x86_64-linux/halo/llama-server-coder-next.nix
@@ -1,57 +0,0 @@
-{
-  pkgs,
-  lib,
-  ...
-}:
-{
-  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
-    after = [ "network-online.target" ];
-    wants = [ "network-online.target" ];
-    wantedBy = [ "multi-user.target" ];
-
-    environment = {
-      HOME = "%S/llama-server";
-      HF_HOME = "%S/llama-server";
-    };
-
-    serviceConfig = {
-      Type = "simple";
-      DynamicUser = true;
-      SupplementaryGroups = [
-        "video"
-        "render"
-      ];
-      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
-      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
-      ExecStart = lib.concatStringsSep " " [
-        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 1"
-        "--jinja"
-        "--host 0.0.0.0"
-        "--port 8000"
-        "--no-mmap"
-        "--n-gpu-layers 99"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--fit on"
-        "--no-context-shift"
-        "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
-        "--alias halo-8000"
-        "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
-        "--slot-save-path %C/llama-server/kv-slots-coder-next"
-      ];
-      Restart = "on-failure";
-      RestartSec = 10;
-
-      PrivateTmp = true;
-      ProtectSystem = "strict";
-      ProtectHome = true;
-      NoNewPrivileges = true;
-    };
-  };
-}
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -5,7 +5,7 @@
 }:
 {
  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
+    description = "llama.cpp server (multi-model preset, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
@@ -23,29 +23,13 @@
        "render"
      ];
      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 1"
-        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
-        "--no-mmap"
-        "--n-gpu-layers 99"
-        "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
-        "--alias halo-8000"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-        "--no-context-shift"
-        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
-        "--fit on"
-        "--slot-save-path %C/llama-server/kv-slots"
+        "--models-preset ${./models.ini}"
+        "--models-max 2"
      ];
      Restart = "on-failure";
      RestartSec = 10;
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -0,0 +1,30 @@
+version = 1
+
+[*]
+flash-attn          = on
+parallel            = 2
+jinja               = true
+n-gpu-layers        = 99
+threads             = 8
+ubatch-size         = 256
+cache-type-k        = bf16
+cache-type-v        = bf16
+temp                = 0.6
+top-p               = 0.95
+top-k               = 20
+min-p               = 0.0
+mmap                = false
+no-context-shift    = true
+chat-template-kwargs = {"preserve_thinking": true}
+c                   = 524288
+fit                 = on
+spec-type           = draft-mtp
+
+[Qwen3.6-35B-A3B]
+hf                  = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
+spec-draft-n-max    = 3
+
+[Qwen3.6-27B]
+hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
+spec-draft-n-max    = 6
+