feat(halo): serve multiple llama models via models.ini preset

Replace the per-model llama-server units with a single service that uses llama-server's --models-preset (models.ini) and --models-max 2, so the 35B-A3B and 27B models are loaded on demand from one config. Drop the now-redundant 27B / 27B-MTP / coder-next variant files and the unused CacheDirectory + slot-save-path KV-slot handling.
2026-05-20 00:19:27 +02:00 · 2026-05-20 00:19:27 +02:00 · 0edf975c30
commit 0edf975c30
parent ae068cfd84
6 changed files with 34 additions and 199 deletions
--- a/systems/x86_64-linux/halo/default.nix
+++ b/systems/x86_64-linux/halo/default.nix
@ -10,8 +10,7 @@ with lib.metacfg;
    ./hardware-configuration.nix
    #./xremap.nix
    ./wyoming.nix
-    #./llama-server-coder-next.nix
+    ./llama-server.nix
-    ./llama-server-27B-MTP.nix
  ];
  boot.lanzaboote.pkiBundle = "/var/lib/sbctl";
--- a/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
+++ b/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
@ -1,61 +0,0 @@
 {
  pkgs,
  lib,
  ...
 }:
 {
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };
    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 2"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "-hf unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K"
        "--alias halo-8000"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 524288"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
        "--spec-type draft-mtp --spec-draft-n-max 6"
        #"--spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64"
      ];
      Restart = "on-failure";
      RestartSec = 10;
      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
 }
--- a/systems/x86_64-linux/halo/llama-server-27B.nix
+++ b/systems/x86_64-linux/halo/llama-server-27B.nix
@ -1,60 +0,0 @@
 {
  pkgs,
  lib,
  ...
 }:
 {
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };
    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 2"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
        "--alias halo-8000"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 524288"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots-27B"
        "--cache-ram 0"
      ];
      Restart = "on-failure";
      RestartSec = 10;
      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
 }
--- a/systems/x86_64-linux/halo/llama-server-coder-next.nix
+++ b/systems/x86_64-linux/halo/llama-server-coder-next.nix
@ -1,57 +0,0 @@
 {
  pkgs,
  lib,
  ...
 }:
 {
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };
    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 1"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--fit on"
        "--no-context-shift"
        "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
        "--alias halo-8000"
        "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
        "--slot-save-path %C/llama-server/kv-slots-coder-next"
      ];
      Restart = "on-failure";
      RestartSec = 10;
      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
 }
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@ -5,7 +5,7 @@
 }:
 {
  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
+    description = "llama.cpp server (multi-model preset, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
@ -23,29 +23,13 @@
        "render"
      ];
      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 1"
-        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
-        "--no-mmap"
+        "--models-preset ${./models.ini}"
-        "--n-gpu-layers 99"
+        "--models-max 2"
-        "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
-        "--alias halo-8000"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-        "--no-context-shift"
-        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
-        "--fit on"
-        "--slot-save-path %C/llama-server/kv-slots"
      ];
      Restart = "on-failure";
      RestartSec = 10;
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@ -0,0 +1,30 @@
 version = 1
 [*]
 flash-attn          = on
 parallel            = 2
 jinja               = true
 n-gpu-layers        = 99
 threads             = 8
 ubatch-size         = 256
 cache-type-k        = bf16
 cache-type-v        = bf16
 temp                = 0.6
 top-p               = 0.95
 top-k               = 20
 min-p               = 0.0
 mmap                = false
 no-context-shift    = true
 chat-template-kwargs = {"preserve_thinking": true}
 c                   = 524288
 fit                 = on
 spec-type           = draft-mtp
 [Qwen3.6-35B-A3B]
 hf                  = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K
 spec-draft-n-max    = 3
 [Qwen3.6-27B]
 hf                  = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K
 spec-draft-n-max    = 6