nixcfg/systems/x86_64-linux/halo/llama-server.nix
Harald Hoyer da88a9b2d6 fix(halo): drop speculative HSA_OVERRIDE_GFX_VERSION from llama-server
Was set defensively without knowing the actual GPU arch; if ROCm
supports the card natively, the override is at best a no-op and at
worst masks the real arch. Add it back with the right value if the
service actually fails to detect the GPU.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 11:42:17 +02:00

59 lines
1.5 KiB
Nix

{
  pkgs,
  lib,
  ...
}:
{
  # llama.cpp inference server (ROCm build) serving Qwen3.6-35B-A3B over an
  # HTTP API on 0.0.0.0:8000.
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      # DynamicUser has no persistent home; point HOME and the Hugging Face
      # cache at the state directory so the downloaded model survives
      # service restarts.
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      # GPU device access (/dev/kfd, /dev/dri/*) for the transient user.
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server"; # %S/llama-server — model/HF cache
      # systemd creates nested cache directories itself, with correct
      # ownership under DynamicUser, so no ExecStartPre mkdir is needed.
      CacheDirectory = [
        "llama-server"
        "llama-server/kv-slots"
      ];
      WorkingDirectory = "%S/llama-server";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 2" # two concurrent request slots
        "--jinja" # use the model's bundled chat template
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99" # offload all layers to the GPU
        "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL"
        "--alias qwen3.6-35b-a3b"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk q8_0 -ctv q8_0" # quantized KV cache to fit the large context
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        # NOTE: trailing space is deliberate — without it the closing shell
        # quote would abut the string terminator as ''' which Nix treats as
        # an escape inside an indented string.
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 524288" # 512Ki-token context window
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots"
      ];
      Restart = "on-failure";
      RestartSec = 10;
      # Hardening: private /tmp, read-only system, no /home access,
      # no privilege escalation.
      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}