From f274fd575791af756ef0be60b7f8c71d1dd2097c Mon Sep 17 00:00:00 2001
From: Chummy <chumyin0912@gmail.com>
Date: Fri, 20 Feb 2026 12:28:05 +0800
Subject: [PATCH] fix(channel): prevent false timeout during multi-turn tool
 loops (#1037)

---
 docs/config-reference.md |  4 +++-
 src/channels/mod.rs      | 38 ++++++++++++++++++++++++++++++++++++--
 src/config/schema.rs     |  9 ++++++---
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/docs/config-reference.md b/docs/config-reference.md
index 0d0da02..4a182f5 100644
--- a/docs/config-reference.md
+++ b/docs/config-reference.md
@@ -332,7 +332,7 @@ Top-level channel options are configured under `channels_config`.
 
 | Key | Default | Purpose |
 |---|---|---|
-| `message_timeout_secs` | `300` | Timeout in seconds for processing a single channel message (LLM + tools) |
+| `message_timeout_secs` | `300` | Base timeout in seconds for channel message processing; the runtime budget is this value multiplied by `min(max_tool_iterations, 4)` (at least 1x) |
 
 Examples:
 
@@ -344,6 +344,8 @@ Examples:
 Notes:
 
 - Default `300s` is optimized for on-device LLMs (Ollama) which are slower than cloud APIs.
+- Runtime timeout budget is `message_timeout_secs * scale`, where `scale = min(max(max_tool_iterations, 1), 4)`, i.e. `max_tool_iterations` clamped to between `1` and `4`.
+- This scaling avoids false timeouts when the first LLM turn is slow/retried but later tool-loop turns still need to complete.
 - If using cloud APIs (OpenAI, Anthropic, etc.), you can reduce this to `60` or lower.
 - Values below `30` are clamped to `30` to avoid immediate timeout churn.
 - When a timeout occurs, users receive: `⚠️ Request timed out while waiting for the model. Please try again.`
diff --git a/src/channels/mod.rs b/src/channels/mod.rs
index 0379bea..b9ec121 100644
--- a/src/channels/mod.rs
+++ b/src/channels/mod.rs
@@ -95,6 +95,8 @@ const MIN_CHANNEL_MESSAGE_TIMEOUT_SECS: u64 = 30;
 /// Default timeout for processing a single channel message (LLM + tools).
 /// Used as fallback when not configured in channels_config.message_timeout_secs.
 const CHANNEL_MESSAGE_TIMEOUT_SECS: u64 = 300;
+/// Cap timeout scaling so large max_tool_iterations values do not create unbounded waits.
+const CHANNEL_MESSAGE_TIMEOUT_SCALE_CAP: u64 = 4;
 const CHANNEL_PARALLELISM_PER_CHANNEL: usize = 4;
 const CHANNEL_MIN_IN_FLIGHT_MESSAGES: usize = 8;
 const CHANNEL_MAX_IN_FLIGHT_MESSAGES: usize = 64;
@@ -114,6 +116,15 @@ fn effective_channel_message_timeout_secs(configured: u64) -> u64 {
     configured.max(MIN_CHANNEL_MESSAGE_TIMEOUT_SECS)
 }
 
+fn channel_message_timeout_budget_secs(
+    message_timeout_secs: u64, // base timeout for one channel message, in seconds
+    max_tool_iterations: usize, // tool-loop iteration limit used to scale the base
+) -> u64 {
+    let iterations = max_tool_iterations.max(1) as u64; // 0 still yields a 1x budget
+    let scale = iterations.min(CHANNEL_MESSAGE_TIMEOUT_SCALE_CAP); // multiplier never exceeds the cap
+    message_timeout_secs.saturating_mul(scale) // saturates instead of overflowing
+}
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 struct ChannelRouteSelection {
     provider: String,
@@ -1223,10 +1234,12 @@ async fn process_channel_message(
         Cancelled,
     }
 
+    let timeout_budget_secs =
+        channel_message_timeout_budget_secs(ctx.message_timeout_secs, ctx.max_tool_iterations);
     let llm_result = tokio::select! {
         () = cancellation_token.cancelled() => LlmExecutionResult::Cancelled,
         result = tokio::time::timeout(
-            Duration::from_secs(ctx.message_timeout_secs),
+            Duration::from_secs(timeout_budget_secs),
             run_tool_call_loop(
                 active_provider.as_ref(),
                 &mut history,
@@ -1385,7 +1398,10 @@ async fn process_channel_message(
             }
         }
         LlmExecutionResult::Completed(Err(_)) => {
-            let timeout_msg = format!("LLM response timed out after {}s", ctx.message_timeout_secs);
+            let timeout_msg = format!(
+                "LLM response timed out after {}s (base={}s, max_tool_iterations={})",
+                timeout_budget_secs, ctx.message_timeout_secs, ctx.max_tool_iterations
+            );
             eprintln!(
                 "  ❌ {} (elapsed: {}ms)",
                 timeout_msg,
@@ -2641,6 +2657,24 @@ mod tests {
         assert_eq!(effective_channel_message_timeout_secs(300), 300);
     }
 
+    #[test]
+    fn channel_message_timeout_budget_scales_with_tool_iterations() {
+        assert_eq!(channel_message_timeout_budget_secs(300, 1), 300); // 1 iteration -> 1x base
+        assert_eq!(channel_message_timeout_budget_secs(300, 2), 600); // 2 iterations -> 2x base
+        assert_eq!(channel_message_timeout_budget_secs(300, 3), 900); // 3 iterations -> 3x base
+    }
+
+    #[test]
+    fn channel_message_timeout_budget_uses_safe_defaults_and_cap() {
+        // 0 iterations fall back to the 1x timeout budget (never a zero-length timeout).
+        assert_eq!(channel_message_timeout_budget_secs(300, 0), 300);
+        // Iteration counts above the cap use the cap as the multiplier to avoid runaway waits.
+        assert_eq!(
+            channel_message_timeout_budget_secs(300, 10),
+            300 * CHANNEL_MESSAGE_TIMEOUT_SCALE_CAP
+        );
+    }
+
     #[test]
     fn context_window_overflow_error_detector_matches_known_messages() {
         let overflow_err = anyhow::anyhow!(
diff --git a/src/config/schema.rs b/src/config/schema.rs
index a31c811..7167ffb 100644
--- a/src/config/schema.rs
+++ b/src/config/schema.rs
@@ -7,9 +7,9 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
 use std::sync::{OnceLock, RwLock};
-use tokio::fs::{self, OpenOptions};
 #[cfg(unix)]
 use tokio::fs::File;
+use tokio::fs::{self, OpenOptions};
 use tokio::io::AsyncWriteExt;
 
 const SUPPORTED_PROXY_SERVICE_KEYS: &[&str] = &[
@@ -2197,7 +2197,10 @@ pub struct ChannelsConfig {
     pub dingtalk: Option<DingTalkConfig>,
     /// QQ Official Bot channel configuration.
     pub qq: Option<QQConfig>,
-    /// Timeout in seconds for processing a single channel message (LLM + tools).
+    /// Base timeout in seconds for processing a single channel message (LLM + tools).
+    /// Runtime multiplies this by `min(max_tool_iterations, 4)` (at least 1x) to get
+    /// the overall budget for the whole tool loop, so one slow/retried model call
+    /// does not consume the entire conversation budget.
     /// Default: 300s for on-device LLMs (Ollama) which are slower than cloud APIs.
     #[serde(default = "default_channel_message_timeout_secs")]
     pub message_timeout_secs: u64,
@@ -3544,9 +3547,9 @@ async fn sync_directory(_path: &Path) -> Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::path::PathBuf;
     #[cfg(unix)]
     use std::{fs::Permissions, os::unix::fs::PermissionsExt};
-    use std::path::PathBuf;
     use tokio::sync::{Mutex, MutexGuard};
     use tokio::test;
     use tokio_stream::wrappers::ReadDirStream;