diff --git a/systems/x86_64-linux/halo/default.nix b/systems/x86_64-linux/halo/default.nix
index 1934cd8..4f613ae 100644
--- a/systems/x86_64-linux/halo/default.nix
+++ b/systems/x86_64-linux/halo/default.nix
@@ -10,7 +10,7 @@ with lib.metacfg;
     ./hardware-configuration.nix
     #./xremap.nix
     ./wyoming.nix
-    ./llama-server.nix
+    ./llama-server-coder-next.nix
   ];
 
   boot.lanzaboote.pkiBundle = "/var/lib/sbctl";
diff --git a/systems/x86_64-linux/halo/llama-server-27B-MTP.nix b/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
new file mode 100644
index 0000000..2b8283d
--- /dev/null
+++ b/systems/x86_64-linux/halo/llama-server-27B-MTP.nix
@@ -0,0 +1,65 @@
+# systemd unit running llama-server from pkgs.llama-cpp-rocm with the
+# am17an/Qwen3.6-27B-MTP-GGUF model, bound to 0.0.0.0:8000.
+# Defines systemd.services.llama-server, the same unit as the sibling
+# llama-server*.nix modules: import only one of them at a time.
+{
+  pkgs,
+  lib,
+  ...
+}:
+{
+  systemd.services.llama-server = {
+    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
+    after = [ "network-online.target" ];
+    wants = [ "network-online.target" ];
+    wantedBy = [ "multi-user.target" ];
+
+    environment = {
+      HOME = "%S/llama-server";
+      HF_HOME = "%S/llama-server";
+    };
+
+    serviceConfig = {
+      Type = "simple";
+      DynamicUser = true;
+      SupplementaryGroups = [
+        "video"
+        "render"
+      ];
+      StateDirectory = "llama-server";
+      CacheDirectory = "llama-server";
+      WorkingDirectory = "%S/llama-server";
+      # Create the directory that is passed to --slot-save-path below.
+      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
+      ExecStart = lib.concatStringsSep " " [
+        "${pkgs.llama-cpp-rocm}/bin/llama-server"
+        "--flash-attn on"
+        "--parallel 1"
+        "--jinja"
+        "--host 0.0.0.0"
+        "--port 8000"
+        "--no-mmap"
+        "--n-gpu-layers 99"
+        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
+        "--alias qwen3.6-27b"
+        "--threads 8"
+        "--ubatch-size 256"
+        "-ctk bf16 -ctv bf16"
+        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
+        "--no-context-shift"
+        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
+        "-c 262144"
+        "--fit on"
+        "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
+        "--spec-type mtp --spec-draft-n-max 3"
+      ];
+      Restart = "on-failure";
+      RestartSec = 10;
+
+      PrivateTmp = true;
+      ProtectSystem = "strict";
+      ProtectHome = true;
+      NoNewPrivileges = true;
+    };
+  };
+}
diff --git a/systems/x86_64-linux/halo/llama-server-coder-next.nix b/systems/x86_64-linux/halo/llama-server-coder-next.nix
new file mode 100644
index 0000000..d384f7c
--- /dev/null
+++ b/systems/x86_64-linux/halo/llama-server-coder-next.nix
@@ -0,0 +1,62 @@
+# systemd unit running llama-server from pkgs.llama-cpp-rocm with the
+# unsloth/Qwen3-Coder-Next-GGUF model, bound to 0.0.0.0:8000.
+# Defines systemd.services.llama-server, the same unit as the sibling
+# llama-server*.nix modules: import only one of them at a time.
+{
+  pkgs,
+  lib,
+  ...
+}:
+{
+  systemd.services.llama-server = {
+    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
+    after = [ "network-online.target" ];
+    wants = [ "network-online.target" ];
+    wantedBy = [ "multi-user.target" ];
+
+    environment = {
+      HOME = "%S/llama-server";
+      HF_HOME = "%S/llama-server";
+    };
+
+    serviceConfig = {
+      Type = "simple";
+      DynamicUser = true;
+      SupplementaryGroups = [
+        "video"
+        "render"
+      ];
+      StateDirectory = "llama-server";
+      CacheDirectory = "llama-server";
+      WorkingDirectory = "%S/llama-server";
+      # Create the directory that is passed to --slot-save-path below.
+      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
+      ExecStart = lib.concatStringsSep " " [
+        "${pkgs.llama-cpp-rocm}/bin/llama-server"
+        "--flash-attn on"
+        "--parallel 1"
+        "--jinja"
+        "--host 0.0.0.0"
+        "--port 8000"
+        "--no-mmap"
+        "--n-gpu-layers 99"
+        "--threads 8"
+        "--ubatch-size 256"
+        "-ctk bf16 -ctv bf16"
+        "--fit on"
+        "--no-context-shift"
+        "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
+        "--alias qwen3-coder-next"
+        "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
+        "--slot-save-path %C/llama-server/kv-slots-coder-next"
+      ];
+      Restart = "on-failure";
+      RestartSec = 10;
+
+      PrivateTmp = true;
+      ProtectSystem = "strict";
+      ProtectHome = true;
+      NoNewPrivileges = true;
+    };
+  };
+}
diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix
index 9ea5870..84f1831 100644
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -5,7 +5,7 @@
 }:
 {
   systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
+    description = "llama.cpp server (Qwen3.6-27B, ROCm)";
     after = [ "network-online.target" ];
     wants = [ "network-online.target" ];
     wantedBy = [ "multi-user.target" ];
@@ -35,7 +35,7 @@
         "--port 8000"
         "--no-mmap"
         "--n-gpu-layers 99"
-        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
+        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
         "--alias qwen3.6-27b"
         "--threads 8"
         "--ubatch-size 256"
@@ -46,7 +46,6 @@
         "-c 524288"
         "--fit on"
         "--slot-save-path %C/llama-server/kv-slots"
-        "--spec-type mtp --spec-draft-n-max 3"
       ];
       Restart = "on-failure";
       RestartSec = 10;