fix(provider): surface actionable custom-provider failure diagnostics
This commit is contained in:
parent
63364a4bfe
commit
58acf1efd3
3 changed files with 218 additions and 38 deletions
|
|
@ -70,6 +70,15 @@ zeroclaw agent -m "test message"
|
|||
|
||||
- Confirm model name matches provider's available models
|
||||
- Check provider documentation for exact model identifiers
|
||||
- Ensure endpoint and model family match. Some custom gateways only expose a subset of models.
|
||||
- Verify available models from the same endpoint and key you configured:
|
||||
|
||||
```bash
|
||||
curl -sS https://your-api.com/models \
|
||||
-H "Authorization: Bearer $API_KEY"
|
||||
```
|
||||
|
||||
- If the gateway does not implement `/models`, send a minimal chat request and inspect the provider's returned model error text.
|
||||
|
||||
### Connection Issues
|
||||
|
||||
|
|
|
|||
|
|
@ -452,6 +452,34 @@ fn extract_responses_text(response: ResponsesResponse) -> Option<String> {
|
|||
None
|
||||
}
|
||||
|
||||
fn compact_sanitized_body_snippet(body: &str) -> String {
|
||||
super::sanitize_api_error(body)
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
fn parse_chat_response_body(provider_name: &str, body: &str) -> anyhow::Result<ApiChatResponse> {
|
||||
serde_json::from_str::<ApiChatResponse>(body).map_err(|error| {
|
||||
let snippet = compact_sanitized_body_snippet(body);
|
||||
anyhow::anyhow!(
|
||||
"{provider_name} API returned an unexpected chat-completions payload: {error}; body={snippet}"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_responses_response_body(
|
||||
provider_name: &str,
|
||||
body: &str,
|
||||
) -> anyhow::Result<ResponsesResponse> {
|
||||
serde_json::from_str::<ResponsesResponse>(body).map_err(|error| {
|
||||
let snippet = compact_sanitized_body_snippet(body);
|
||||
anyhow::anyhow!(
|
||||
"{provider_name} Responses API returned an unexpected payload: {error}; body={snippet}"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
impl OpenAiCompatibleProvider {
|
||||
fn apply_auth_header(
|
||||
&self,
|
||||
|
|
@ -494,7 +522,8 @@ impl OpenAiCompatibleProvider {
|
|||
anyhow::bail!("{} Responses API error: {error}", self.name);
|
||||
}
|
||||
|
||||
let responses: ResponsesResponse = response.json().await?;
|
||||
let body = response.text().await?;
|
||||
let responses = parse_responses_response_body(&self.name, &body)?;
|
||||
|
||||
extract_responses_text(responses)
|
||||
.ok_or_else(|| anyhow::anyhow!("No response from {} Responses API", self.name))
|
||||
|
|
@ -573,7 +602,8 @@ impl Provider for OpenAiCompatibleProvider {
|
|||
anyhow::bail!("{} API error ({status}): {sanitized}", self.name);
|
||||
}
|
||||
|
||||
let chat_response: ApiChatResponse = response.json().await?;
|
||||
let body = response.text().await?;
|
||||
let chat_response = parse_chat_response_body(&self.name, &body)?;
|
||||
|
||||
chat_response
|
||||
.choices
|
||||
|
|
@ -663,7 +693,8 @@ impl Provider for OpenAiCompatibleProvider {
|
|||
return Err(super::api_error(&self.name, response).await);
|
||||
}
|
||||
|
||||
let chat_response: ApiChatResponse = response.json().await?;
|
||||
let body = response.text().await?;
|
||||
let chat_response = parse_chat_response_body(&self.name, &body)?;
|
||||
|
||||
chat_response
|
||||
.choices
|
||||
|
|
@ -737,7 +768,8 @@ impl Provider for OpenAiCompatibleProvider {
|
|||
return Err(super::api_error(&self.name, response).await);
|
||||
}
|
||||
|
||||
let chat_response: ApiChatResponse = response.json().await?;
|
||||
let body = response.text().await?;
|
||||
let chat_response = parse_chat_response_body(&self.name, &body)?;
|
||||
let choice = chat_response
|
||||
.choices
|
||||
.into_iter()
|
||||
|
|
@ -1033,6 +1065,30 @@ mod tests {
|
|||
assert!(resp.choices.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_chat_response_body_reports_sanitized_snippet() {
|
||||
let body = r#"{"choices":"invalid","api_key":"sk-test-secret-value"}"#;
|
||||
let err = parse_chat_response_body("custom", body).expect_err("payload should fail");
|
||||
let msg = err.to_string();
|
||||
|
||||
assert!(msg.contains("custom API returned an unexpected chat-completions payload"));
|
||||
assert!(msg.contains("body="));
|
||||
assert!(msg.contains("[REDACTED]"));
|
||||
assert!(!msg.contains("sk-test-secret-value"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_responses_response_body_reports_sanitized_snippet() {
|
||||
let body = r#"{"output_text":123,"api_key":"sk-another-secret"}"#;
|
||||
let err = parse_responses_response_body("custom", body).expect_err("payload should fail");
|
||||
let msg = err.to_string();
|
||||
|
||||
assert!(msg.contains("custom Responses API returned an unexpected payload"));
|
||||
assert!(msg.contains("body="));
|
||||
assert!(msg.contains("[REDACTED]"));
|
||||
assert!(!msg.contains("sk-another-secret"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn x_api_key_auth_style() {
|
||||
let p = OpenAiCompatibleProvider::new(
|
||||
|
|
|
|||
|
|
@ -22,7 +22,37 @@ fn is_non_retryable(err: &anyhow::Error) -> bool {
|
|||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
|
||||
let msg_lower = msg.to_lowercase();
|
||||
let auth_failure_hints = [
|
||||
"invalid api key",
|
||||
"incorrect api key",
|
||||
"missing api key",
|
||||
"api key not set",
|
||||
"authentication failed",
|
||||
"auth failed",
|
||||
"unauthorized",
|
||||
"forbidden",
|
||||
"permission denied",
|
||||
"access denied",
|
||||
"invalid token",
|
||||
];
|
||||
|
||||
if auth_failure_hints
|
||||
.iter()
|
||||
.any(|hint| msg_lower.contains(hint))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
let model_catalog_mismatch = msg_lower.contains("model")
|
||||
&& (msg_lower.contains("not found")
|
||||
|| msg_lower.contains("unknown")
|
||||
|| msg_lower.contains("unsupported")
|
||||
|| msg_lower.contains("does not exist")
|
||||
|| msg_lower.contains("invalid"));
|
||||
|
||||
model_catalog_mismatch
|
||||
}
|
||||
|
||||
/// Check if an error is a rate-limit (429) error.
|
||||
|
|
@ -70,6 +100,37 @@ fn parse_retry_after_ms(err: &anyhow::Error) -> Option<u64> {
|
|||
None
|
||||
}
|
||||
|
||||
fn failure_reason(rate_limited: bool, non_retryable: bool) -> &'static str {
|
||||
if rate_limited {
|
||||
"rate_limited"
|
||||
} else if non_retryable {
|
||||
"non_retryable"
|
||||
} else {
|
||||
"retryable"
|
||||
}
|
||||
}
|
||||
|
||||
fn compact_error_detail(err: &anyhow::Error) -> String {
|
||||
super::sanitize_api_error(&err.to_string())
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
fn push_failure(
|
||||
failures: &mut Vec<String>,
|
||||
provider_name: &str,
|
||||
model: &str,
|
||||
attempt: u32,
|
||||
max_attempts: u32,
|
||||
reason: &str,
|
||||
error_detail: &str,
|
||||
) {
|
||||
failures.push(format!(
|
||||
"provider={provider_name} model={model} attempt {attempt}/{max_attempts}: {reason}; error={error_detail}"
|
||||
));
|
||||
}
|
||||
|
||||
/// Provider wrapper with retry, fallback, auth rotation, and model failover.
|
||||
pub struct ReliableProvider {
|
||||
providers: Vec<(String, Box<dyn Provider>)>,
|
||||
|
|
@ -185,25 +246,25 @@ impl Provider for ReliableProvider {
|
|||
Err(e) => {
|
||||
let non_retryable = is_non_retryable(&e);
|
||||
let rate_limited = is_rate_limited(&e);
|
||||
let failure_reason = failure_reason(rate_limited, non_retryable);
|
||||
let error_detail = compact_error_detail(&e);
|
||||
|
||||
let failure_reason = if rate_limited {
|
||||
"rate_limited"
|
||||
} else if non_retryable {
|
||||
"non_retryable"
|
||||
} else {
|
||||
"retryable"
|
||||
};
|
||||
failures.push(format!(
|
||||
"provider={provider_name} model={current_model} attempt {}/{}: {failure_reason}",
|
||||
push_failure(
|
||||
&mut failures,
|
||||
provider_name,
|
||||
current_model,
|
||||
attempt + 1,
|
||||
self.max_retries + 1
|
||||
));
|
||||
self.max_retries + 1,
|
||||
failure_reason,
|
||||
&error_detail,
|
||||
);
|
||||
|
||||
// On rate-limit, try rotating API key
|
||||
if rate_limited {
|
||||
if let Some(new_key) = self.rotate_key() {
|
||||
tracing::info!(
|
||||
provider = provider_name,
|
||||
error = %error_detail,
|
||||
"Rate limited, rotated API key (key ending ...{})",
|
||||
&new_key[new_key.len().saturating_sub(4)..]
|
||||
);
|
||||
|
|
@ -214,6 +275,7 @@ impl Provider for ReliableProvider {
|
|||
tracing::warn!(
|
||||
provider = provider_name,
|
||||
model = *current_model,
|
||||
error = %error_detail,
|
||||
"Non-retryable error, moving on"
|
||||
);
|
||||
break;
|
||||
|
|
@ -226,6 +288,8 @@ impl Provider for ReliableProvider {
|
|||
model = *current_model,
|
||||
attempt = attempt + 1,
|
||||
backoff_ms = wait,
|
||||
reason = failure_reason,
|
||||
error = %error_detail,
|
||||
"Provider call failed, retrying"
|
||||
);
|
||||
tokio::time::sleep(Duration::from_millis(wait)).await;
|
||||
|
|
@ -290,24 +354,24 @@ impl Provider for ReliableProvider {
|
|||
Err(e) => {
|
||||
let non_retryable = is_non_retryable(&e);
|
||||
let rate_limited = is_rate_limited(&e);
|
||||
let failure_reason = failure_reason(rate_limited, non_retryable);
|
||||
let error_detail = compact_error_detail(&e);
|
||||
|
||||
let failure_reason = if rate_limited {
|
||||
"rate_limited"
|
||||
} else if non_retryable {
|
||||
"non_retryable"
|
||||
} else {
|
||||
"retryable"
|
||||
};
|
||||
failures.push(format!(
|
||||
"provider={provider_name} model={current_model} attempt {}/{}: {failure_reason}",
|
||||
push_failure(
|
||||
&mut failures,
|
||||
provider_name,
|
||||
current_model,
|
||||
attempt + 1,
|
||||
self.max_retries + 1
|
||||
));
|
||||
self.max_retries + 1,
|
||||
failure_reason,
|
||||
&error_detail,
|
||||
);
|
||||
|
||||
if rate_limited {
|
||||
if let Some(new_key) = self.rotate_key() {
|
||||
tracing::info!(
|
||||
provider = provider_name,
|
||||
error = %error_detail,
|
||||
"Rate limited, rotated API key (key ending ...{})",
|
||||
&new_key[new_key.len().saturating_sub(4)..]
|
||||
);
|
||||
|
|
@ -318,6 +382,7 @@ impl Provider for ReliableProvider {
|
|||
tracing::warn!(
|
||||
provider = provider_name,
|
||||
model = *current_model,
|
||||
error = %error_detail,
|
||||
"Non-retryable error, moving on"
|
||||
);
|
||||
break;
|
||||
|
|
@ -330,6 +395,8 @@ impl Provider for ReliableProvider {
|
|||
model = *current_model,
|
||||
attempt = attempt + 1,
|
||||
backoff_ms = wait,
|
||||
reason = failure_reason,
|
||||
error = %error_detail,
|
||||
"Provider call failed, retrying"
|
||||
);
|
||||
tokio::time::sleep(Duration::from_millis(wait)).await;
|
||||
|
|
@ -394,24 +461,24 @@ impl Provider for ReliableProvider {
|
|||
Err(e) => {
|
||||
let non_retryable = is_non_retryable(&e);
|
||||
let rate_limited = is_rate_limited(&e);
|
||||
let failure_reason = failure_reason(rate_limited, non_retryable);
|
||||
let error_detail = compact_error_detail(&e);
|
||||
|
||||
let failure_reason = if rate_limited {
|
||||
"rate_limited"
|
||||
} else if non_retryable {
|
||||
"non_retryable"
|
||||
} else {
|
||||
"retryable"
|
||||
};
|
||||
failures.push(format!(
|
||||
"{provider_name}/{current_model} attempt {}/{}: {failure_reason}",
|
||||
push_failure(
|
||||
&mut failures,
|
||||
provider_name,
|
||||
current_model,
|
||||
attempt + 1,
|
||||
self.max_retries + 1
|
||||
));
|
||||
self.max_retries + 1,
|
||||
failure_reason,
|
||||
&error_detail,
|
||||
);
|
||||
|
||||
if rate_limited {
|
||||
if let Some(new_key) = self.rotate_key() {
|
||||
tracing::info!(
|
||||
provider = provider_name,
|
||||
error = %error_detail,
|
||||
"Rate limited, rotated API key (key ending ...{})",
|
||||
&new_key[new_key.len().saturating_sub(4)..]
|
||||
);
|
||||
|
|
@ -422,6 +489,7 @@ impl Provider for ReliableProvider {
|
|||
tracing::warn!(
|
||||
provider = provider_name,
|
||||
model = *current_model,
|
||||
error = %error_detail,
|
||||
"Non-retryable error, moving on"
|
||||
);
|
||||
break;
|
||||
|
|
@ -434,6 +502,8 @@ impl Provider for ReliableProvider {
|
|||
model = *current_model,
|
||||
attempt = attempt + 1,
|
||||
backoff_ms = wait,
|
||||
reason = failure_reason,
|
||||
error = %error_detail,
|
||||
"Provider call failed, retrying"
|
||||
);
|
||||
tokio::time::sleep(Duration::from_millis(wait)).await;
|
||||
|
|
@ -716,6 +786,9 @@ mod tests {
|
|||
assert!(msg.contains("All providers/models failed"));
|
||||
assert!(msg.contains("provider=p1 model=test"));
|
||||
assert!(msg.contains("provider=p2 model=test"));
|
||||
assert!(msg.contains("error=p1 error"));
|
||||
assert!(msg.contains("error=p2 error"));
|
||||
assert!(msg.contains("retryable"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -724,6 +797,16 @@ mod tests {
|
|||
assert!(is_non_retryable(&anyhow::anyhow!("401 Unauthorized")));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!("403 Forbidden")));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!("404 Not Found")));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!(
|
||||
"invalid api key provided"
|
||||
)));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!("authentication failed")));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!(
|
||||
"model glm-4.7 not found"
|
||||
)));
|
||||
assert!(is_non_retryable(&anyhow::anyhow!(
|
||||
"unsupported model: glm-4.7"
|
||||
)));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!("429 Too Many Requests")));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!("408 Request Timeout")));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!(
|
||||
|
|
@ -732,6 +815,38 @@ mod tests {
|
|||
assert!(!is_non_retryable(&anyhow::anyhow!("502 Bad Gateway")));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!("timeout")));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!("connection reset")));
|
||||
assert!(!is_non_retryable(&anyhow::anyhow!(
|
||||
"model overloaded, try again later"
|
||||
)));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aggregated_error_marks_non_retryable_model_mismatch_with_details() {
|
||||
let calls = Arc::new(AtomicUsize::new(0));
|
||||
let provider = ReliableProvider::new(
|
||||
vec![(
|
||||
"custom".into(),
|
||||
Box::new(MockProvider {
|
||||
calls: Arc::clone(&calls),
|
||||
fail_until_attempt: usize::MAX,
|
||||
response: "never",
|
||||
error: "unsupported model: glm-4.7",
|
||||
}),
|
||||
)],
|
||||
3,
|
||||
1,
|
||||
);
|
||||
|
||||
let err = provider
|
||||
.simple_chat("hello", "glm-4.7", 0.0)
|
||||
.await
|
||||
.expect_err("provider should fail");
|
||||
let msg = err.to_string();
|
||||
|
||||
assert!(msg.contains("non_retryable"));
|
||||
assert!(msg.contains("error=unsupported model: glm-4.7"));
|
||||
// Non-retryable errors should not consume retry budget.
|
||||
assert_eq!(calls.load(Ordering::SeqCst), 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue