perf(llama-cpp-rocm): tune for Strix Halo (gfx1151)
- Restrict `rocmGpuTargets` to gfx1151 (Radeon 8060S, RDNA 3.5) — smaller closure, faster compile, no wasted device kernels.
- Enable `GGML_HIP_ROCWMMA_FATTN`: rocWMMA-backed flash attention is a major win on RDNA3+ for the GPU-offloaded attention path.
- Enable `GGML_HIP_GRAPHS` to lower per-token launch overhead.
- Add `rocwmma` to `buildInputs` to satisfy the WMMA path.

llama-server on halo runs with `-ngl 99 --flash-attn on`, so these flags target the hot path. CPU-side AVX-512 was skipped intentionally — Zen 5 has it, but with full GPU offload the CPU paths barely run.
This commit is contained in:
parent
623a71f95f
commit
f62e8ac470
1 changed file with 27 additions and 12 deletions
|
|
@ -12,7 +12,13 @@ final: prev: {
|
||||||
# nodejs_20
|
# nodejs_20
|
||||||
;
|
;
|
||||||
|
|
||||||
llama-cpp-rocm = channels.unstable.llama-cpp-rocm.overrideAttrs (_: {
|
# Tuned for Strix Halo (Ryzen AI Max+ 395 / Radeon 8060S, gfx1151).
|
||||||
|
llama-cpp-rocm =
|
||||||
|
(channels.unstable.llama-cpp.override {
|
||||||
|
rocmSupport = true;
|
||||||
|
rocmGpuTargets = [ "gfx1151" ];
|
||||||
|
}).overrideAttrs
|
||||||
|
(prevAttrs: {
|
||||||
src = prev.fetchFromGitHub {
|
src = prev.fetchFromGitHub {
|
||||||
owner = "am17an";
|
owner = "am17an";
|
||||||
repo = "llama.cpp";
|
repo = "llama.cpp";
|
||||||
|
|
@ -23,6 +29,15 @@ final: prev: {
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
npmDepsHash = "sha256-k62LIbyY2DXvs7XXbX0lNPiYxuYzeJUyQtS4eA+68f8=";
|
npmDepsHash = "sha256-k62LIbyY2DXvs7XXbX0lNPiYxuYzeJUyQtS4eA+68f8=";
|
||||||
|
|
||||||
|
buildInputs = (prevAttrs.buildInputs or [ ]) ++ [
|
||||||
|
channels.unstable.rocmPackages.rocwmma
|
||||||
|
];
|
||||||
|
|
||||||
|
cmakeFlags = (prevAttrs.cmakeFlags or [ ]) ++ [
|
||||||
|
"-DGGML_HIP_ROCWMMA_FATTN=ON"
|
||||||
|
"-DGGML_HIP_GRAPHS=ON"
|
||||||
|
];
|
||||||
});
|
});
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue