feat(halo): serve multiple llama models via models.ini preset

Replace the per-model llama-server units with a single service that uses llama-server's --models-preset (models.ini) and --models-max 2, so the 35B-A3B and 27B models are loaded on demand from one config. Drop the now-redundant 27B / 27B-MTP / coder-next variant files and the unused CacheDirectory + slot-save-path KV-slot handling.
2026-05-20 00:19:27 +02:00 · 2026-05-20 00:19:27 +02:00 · 0edf975c30
commit 0edf975c30
parent ae068cfd84
6 changed files with 34 additions and 199 deletions
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -5,7 +5,7 @@
 }:
 {
  systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
+    description = "llama.cpp server (multi-model preset, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
@@ -23,29 +23,13 @@
        "render"
      ];
      StateDirectory = "llama-server";
-      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
-      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
-        "--flash-attn on"
-        "--parallel 1"
-        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
-        "--no-mmap"
-        "--n-gpu-layers 99"
-        "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
-        "--alias halo-8000"
-        "--threads 8"
-        "--ubatch-size 256"
-        "-ctk bf16 -ctv bf16"
-        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-        "--no-context-shift"
-        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
-        "--fit on"
-        "--slot-save-path %C/llama-server/kv-slots"
+        "--models-preset ${./models.ini}"
+        "--models-max 2"
      ];
      Restart = "on-failure";
      RestartSec = 10;