From 6c5ce8742c186ebaffa29c6834f209aba5201f45 Mon Sep 17 00:00:00 2001
From: Harald Hoyer <harald@hoyer.xyz>
Date: Wed, 20 May 2026 14:23:20 +0200
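Subject: [PATCH] fix(halo): limit llama-server to one loaded model

Lower --models-max from 2 to 1 and move the preserve_thinking
chat-template-kwargs setting to the preset that is loaded on startup.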

---
 systems/x86_64-linux/halo/llama-server.nix | 2 +-
 systems/x86_64-linux/halo/models.ini       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/systems/x86_64-linux/halo/llama-server.nix b/systems/x86_64-linux/halo/llama-server.nix
index 8264f3a..4cca9f8 100644
--- a/systems/x86_64-linux/halo/llama-server.nix
+++ b/systems/x86_64-linux/halo/llama-server.nix
@@ -29,7 +29,7 @@
         "--host 0.0.0.0"
         "--port 8000"
         "--models-preset ${./models.ini}"
-        "--models-max 2"
+        "--models-max 1"
       ];
       Restart = "on-failure";
       RestartSec = 10;
diff --git a/systems/x86_64-linux/halo/models.ini b/systems/x86_64-linux/halo/models.ini
index ddb3509..6632557 100644
--- a/systems/x86_64-linux/halo/models.ini
+++ b/systems/x86_64-linux/halo/models.ini
@@ -11,7 +11,6 @@ cache-type-k        = bf16
 cache-type-v        = bf16
 mmap                = false
 no-context-shift    = true
-chat-template-kwargs = {"preserve_thinking": true}
 fit                 = on
 c                   = 131072
 
@@ -27,3 +26,4 @@ min-p               = 0.0
 parallel            = 2
 c                   = 524288
 load-on-startup     = true
+chat-template-kwargs = {"preserve_thinking": true}