diff --git a/systems/x86_64-linux/halo/default.nix b/systems/x86_64-linux/halo/default.nix index 41b5ebd..1934cd8 100644 --- a/systems/x86_64-linux/halo/default.nix +++ b/systems/x86_64-linux/halo/default.nix @@ -10,8 +10,7 @@ with lib.metacfg; ./hardware-configuration.nix #./xremap.nix ./wyoming.nix - #./llama-server-coder-next.nix - ./llama-server-27B-MTP.nix + ./llama-server.nix ]; boot.lanzaboote.pkiBundle = "/var/lib/sbctl"; diff --git a/systems/x86_64-linux/halo/llama-server-27B-MTP.nix b/systems/x86_64-linux/halo/llama-server-27B-MTP.nix deleted file mode 100644 index 94e7efc..0000000 --- a/systems/x86_64-linux/halo/llama-server-27B-MTP.nix +++ /dev/null @@ -1,61 +0,0 @@ -{ - pkgs, - lib, - ... -}: -{ - systemd.services.llama-server = { - description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)"; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - - environment = { - HOME = "%S/llama-server"; - HF_HOME = "%S/llama-server"; - }; - - serviceConfig = { - Type = "simple"; - DynamicUser = true; - SupplementaryGroups = [ - "video" - "render" - ]; - StateDirectory = "llama-server"; - CacheDirectory = "llama-server"; - WorkingDirectory = "%S/llama-server"; - ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP"; - ExecStart = lib.concatStringsSep " " [ - "${pkgs.llama-cpp-rocm}/bin/llama-server" - "--flash-attn on" - "--parallel 2" - "--jinja" - "--host 0.0.0.0" - "--port 8000" - "--no-mmap" - "--n-gpu-layers 99" - "-hf unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K" - "--alias halo-8000" - "--threads 8" - "--ubatch-size 256" - "-ctk bf16 -ctv bf16" - "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00" - "--no-context-shift" - ''--chat-template-kwargs '{"preserve_thinking": true}' '' - "-c 524288" - "--fit on" - "--slot-save-path %C/llama-server/kv-slots-27B-MTP" - "--spec-type draft-mtp --spec-draft-n-max 6" - #"--spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64" - ]; - Restart = "on-failure"; - RestartSec = 10; - - PrivateTmp = true; - ProtectSystem = "strict"; - ProtectHome = true; - NoNewPrivileges = true; - }; - }; -} diff --git a/systems/x86_64-linux/halo/llama-server-27B.nix b/systems/x86_64-linux/halo/llama-server-27B.nix deleted file mode 100644 index 94f0cdc..0000000 --- a/systems/x86_64-linux/halo/llama-server-27B.nix +++ /dev/null @@ -1,60 +0,0 @@ -{ - pkgs, - lib, - ... -}: -{ - systemd.services.llama-server = { - description = "llama.cpp server (Qwen3.6-27B, ROCm)"; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - - environment = { - HOME = "%S/llama-server"; - HF_HOME = "%S/llama-server"; - }; - - serviceConfig = { - Type = "simple"; - DynamicUser = true; - SupplementaryGroups = [ - "video" - "render" - ]; - StateDirectory = "llama-server"; - CacheDirectory = "llama-server"; - WorkingDirectory = "%S/llama-server"; - ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B"; - ExecStart = lib.concatStringsSep " " [ - "${pkgs.llama-cpp-rocm}/bin/llama-server" - "--flash-attn on" - "--parallel 2" - "--jinja" - "--host 0.0.0.0" - "--port 8000" - "--no-mmap" - "--n-gpu-layers 99" - "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL" - "--alias halo-8000" - "--threads 8" - "--ubatch-size 256" - "-ctk bf16 -ctv bf16" - "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00" - "--no-context-shift" - ''--chat-template-kwargs '{"preserve_thinking": true}' '' - "-c 524288" - "--fit on" - "--slot-save-path %C/llama-server/kv-slots-27B" - "--cache-ram 0" - ]; - Restart = "on-failure"; - RestartSec = 10; - - PrivateTmp = true; - ProtectSystem = "strict"; - ProtectHome = true; - NoNewPrivileges = true; - }; - }; -} diff --git a/systems/x86_64-linux/halo/llama-server-coder-next.nix b/systems/x86_64-linux/halo/llama-server-coder-next.nix deleted file mode 100644 index 7f34a3c..0000000 --- a/systems/x86_64-linux/halo/llama-server-coder-next.nix +++ /dev/null @@ -1,57 +0,0 @@ -{ - pkgs, - lib, - ... -}: -{ - systemd.services.llama-server = { - description = "llama.cpp server (Qwen3-Coder-Next, ROCm)"; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - - environment = { - HOME = "%S/llama-server"; - HF_HOME = "%S/llama-server"; - }; - - serviceConfig = { - Type = "simple"; - DynamicUser = true; - SupplementaryGroups = [ - "video" - "render" - ]; - StateDirectory = "llama-server"; - CacheDirectory = "llama-server"; - WorkingDirectory = "%S/llama-server"; - ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next"; - ExecStart = lib.concatStringsSep " " [ - "${pkgs.llama-cpp-rocm}/bin/llama-server" - "--flash-attn on" - "--parallel 1" - "--jinja" - "--host 0.0.0.0" - "--port 8000" - "--no-mmap" - "--n-gpu-layers 99" - "--threads 8" - "--ubatch-size 256" - "-ctk bf16 -ctv bf16" - "--fit on" - "--no-context-shift" - "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL" - "--alias halo-8000" - "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40" - "--slot-save-path %C/llama-server/kv-slots-coder-next" - ]; - Restart = "on-failure"; - RestartSec = 10; - - PrivateTmp = true; - ProtectSystem = "strict"; - ProtectHome = true; - NoNewPrivileges = true; - }; - }; -} diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix index 683286b..8264f3a 100644 --- a/systems/x86_64-linux/halo/llama-server.nix +++ b/systems/x86_64-linux/halo/llama-server.nix @@ -5,7 +5,7 @@ }: { systemd.services.llama-server = { - description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)"; + description = "llama.cpp server (multi-model preset, ROCm)"; after = [ "network-online.target" ]; wants = [ "network-online.target" ]; wantedBy = [ "multi-user.target" ]; @@ -23,29 +23,13 @@ "render" ]; StateDirectory = "llama-server"; - CacheDirectory = "llama-server"; WorkingDirectory = "%S/llama-server"; - ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots"; ExecStart = lib.concatStringsSep " " [ "${pkgs.llama-cpp-rocm}/bin/llama-server" - "--flash-attn on" - "--parallel 1" - "--jinja" "--host 0.0.0.0" "--port 8000" - "--no-mmap" - "--n-gpu-layers 99" - "-hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q8_K_XL" - "--alias halo-8000" - "--threads 8" - "--ubatch-size 256" - "-ctk bf16 -ctv bf16" - "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00" - "--no-context-shift" - ''--chat-template-kwargs '{"preserve_thinking": true}' '' - "-c 262144" - "--fit on" - "--slot-save-path %C/llama-server/kv-slots" + "--models-preset ${./models.ini}" + "--models-max 2" ]; Restart = "on-failure"; RestartSec = 10; diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini new file mode 100644 index 0000000..81de281 --- /dev/null +++ b/systems/x86_64-linux/halo/models.ini @@ -0,0 +1,30 @@ +version = 1 + +[*] +flash-attn = on +parallel = 2 +jinja = true +n-gpu-layers = 99 +threads = 8 +ubatch-size = 256 +cache-type-k = bf16 +cache-type-v = bf16 +temp = 0.6 +top-p = 0.95 +top-k = 20 +min-p = 0.0 +mmap = false +no-context-shift = true +chat-template-kwargs = {"preserve_thinking": true} +c = 524288 +fit = on +spec-type = draft-mtp + +[Qwen3.6-35B-A3B] +hf = unsloth/Qwen3.6-35B-A3B-MTP-GGUF:UD-Q6_K +spec-draft-n-max = 3 + +[Qwen3.6-27B] +hf = unsloth/Qwen3.6-27B-MTP-GGUF:Q6_K +spec-draft-n-max = 6 +