feat(halo): add different llama servers

parent b548126fb8
commit 2009ea96b3

4 changed files with 122 additions and 6 deletions
@@ -10,7 +10,7 @@ with lib.metacfg;
     ./hardware-configuration.nix
     #./xremap.nix
     ./wyoming.nix
-    ./llama-server.nix
+    ./llama-server-coder-next.nix
   ];

   boot.lanzaboote.pkiBundle = "/var/lib/sbctl";
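All three llama-server modules define the same systemd.services.llama-server unit, so importing more than one at a time would give conflicting definitions for options like description and ExecStart. A minimal sketch of how the host profile selects the active model, assuming this repo's file layout (the commented alternatives are illustrative, not part of this commit):

    # Sketch only: enable exactly one llama-server module per host.
    { ... }:
    {
      imports = [
        ./hardware-configuration.nix
        ./wyoming.nix
        # Pick one model profile; they all claim systemd.services.llama-server.
        ./llama-server-coder-next.nix
        # ./llama-server-27B-MTP.nix
        # ./llama-server.nix
      ];
    }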
systems/x86_64-linux/halo/llama-server-27B-MTP.nix (new file, 60 lines)
@@ -0,0 +1,60 @@
{
  pkgs,
  lib,
  ...
}:
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-27B-MTP";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 1"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
        "--alias qwen3.6-27b"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
        "--no-context-shift"
        ''--chat-template-kwargs '{"preserve_thinking": true}' ''
        "-c 262144"
        "--fit on"
        "--slot-save-path %C/llama-server/kv-slots-27B-MTP"
        "--spec-type mtp --spec-draft-n-max 3"
      ];
      Restart = "on-failure";
      RestartSec = 10;

      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}
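The unit binds llama-server to 0.0.0.0:8000, so other machines can reach it only if the host firewall allows that port. A hedged sketch, assuming halo keeps the default NixOS firewall enabled (this rule is not part of the commit):

    # Assumption: the NixOS firewall is active on halo; open the llama-server port.
    {
      networking.firewall.allowedTCPPorts = [ 8000 ];
    }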
systems/x86_64-linux/halo/llama-server-coder-next.nix (new file, 57 lines)
@@ -0,0 +1,57 @@
{
  pkgs,
  lib,
  ...
}:
{
  systemd.services.llama-server = {
    description = "llama.cpp server (Qwen3-Coder-Next, ROCm)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];

    environment = {
      HOME = "%S/llama-server";
      HF_HOME = "%S/llama-server";
    };

    serviceConfig = {
      Type = "simple";
      DynamicUser = true;
      SupplementaryGroups = [
        "video"
        "render"
      ];
      StateDirectory = "llama-server";
      CacheDirectory = "llama-server";
      WorkingDirectory = "%S/llama-server";
      ExecStartPre = "${pkgs.coreutils}/bin/mkdir -p %C/llama-server/kv-slots-coder-next";
      ExecStart = lib.concatStringsSep " " [
        "${pkgs.llama-cpp-rocm}/bin/llama-server"
        "--flash-attn on"
        "--parallel 1"
        "--jinja"
        "--host 0.0.0.0"
        "--port 8000"
        "--no-mmap"
        "--n-gpu-layers 99"
        "--threads 8"
        "--ubatch-size 256"
        "-ctk bf16 -ctv bf16"
        "--fit on"
        "--no-context-shift"
        "-hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q8_K_XL"
        "--alias qwen3-coder-next"
        "--temp 1.0 --top-p 0.95 --min-p 0.01 --top-k 40"
        "--slot-save-path %C/llama-server/kv-slots-coder-next"
      ];
      Restart = "on-failure";
      RestartSec = 10;

      PrivateTmp = true;
      ProtectSystem = "strict";
      ProtectHome = true;
      NoNewPrivileges = true;
    };
  };
}
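Both new modules (and the modified server module below) launch "${pkgs.llama-cpp-rocm}/bin/llama-server", but llama-cpp-rocm is not a stock nixpkgs attribute, so it presumably comes from an overlay defined elsewhere in this flake. A hedged sketch of one way such an attribute could be provided, assuming nixpkgs' llama-cpp package still accepts a rocmSupport override:

    # Assumption: pkgs.llama-cpp-rocm is supplied by an overlay along these lines;
    # the real definition lives elsewhere in this repository.
    final: prev: {
      llama-cpp-rocm = prev.llama-cpp.override { rocmSupport = true; };
    }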
@@ -5,7 +5,7 @@
 }:
 {
   systemd.services.llama-server = {
-    description = "llama.cpp server (Qwen3.6-27B-MTP, ROCm)";
+    description = "llama.cpp server (Qwen3.6-35B-A3B, ROCm)";
     after = [ "network-online.target" ];
     wants = [ "network-online.target" ];
     wantedBy = [ "multi-user.target" ];

@@ -29,13 +29,13 @@
       ExecStart = lib.concatStringsSep " " [
         "${pkgs.llama-cpp-rocm}/bin/llama-server"
         "--flash-attn on"
-        "--parallel 1"
+        "--parallel 2"
         "--jinja"
         "--host 0.0.0.0"
         "--port 8000"
         "--no-mmap"
         "--n-gpu-layers 99"
-        "-hf am17an/Qwen3.6-27B-MTP-GGUF:Q8_0"
+        "-hf unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL"
         "--alias qwen3.6-27b"
         "--threads 8"
         "--ubatch-size 256"

@@ -43,10 +43,9 @@
         "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
         "--no-context-shift"
         ''--chat-template-kwargs '{"preserve_thinking": true}' ''
-        "-c 262144"
+        "-c 524288"
         "--fit on"
         "--slot-save-path %C/llama-server/kv-slots"
-        "--spec-type mtp --spec-draft-n-max 3"
       ];
       Restart = "on-failure";
       RestartSec = 10;