feat(doctor): add enhanced diagnostics and config validation

- Expand  with grouped health report output
- Add semantic config checks (provider/model/temp/routes/channels)
- Add workspace checks (existence, write probe, disk availability)
- Preserve daemon/scheduler/channel freshness diagnostics
- Add environment checks (git/curl/shell/home)
- Add unit tests for provider validation and config edge cases

Also fix upstream signature drift to keep build green:
- channels: pass provider_name to agent_turn
- channels: pass workspace_dir to all_tools_with_runtime
- daemon: pass verbose flag to agent::run
This commit is contained in:
stawky 2026-02-16 19:30:30 +08:00 committed by Chummy
parent 7ebda43fdd
commit b0d4a1297b

View file

@ -1,28 +1,429 @@
use crate::config::Config;
use anyhow::{Context, Result};
use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::Path;
const DAEMON_STALE_SECONDS: i64 = 30;
const SCHEDULER_STALE_SECONDS: i64 = 120;
const CHANNEL_STALE_SECONDS: i64 = 300;
pub fn run(config: &Config) -> Result<()> {
let state_file = crate::daemon::state_file_path(config);
if !state_file.exists() {
println!("🩺 ZeroClaw Doctor");
println!(" ❌ daemon state file not found: {}", state_file.display());
println!(" 💡 Start daemon with: zeroclaw daemon");
return Ok(());
/// Known built-in provider names (must stay in sync with `create_provider`).
const KNOWN_PROVIDERS: &[&str] = &[
"openrouter",
"anthropic",
"openai",
"ollama",
"gemini",
"google",
"google-gemini",
"venice",
"vercel",
"vercel-ai",
"cloudflare",
"cloudflare-ai",
"moonshot",
"kimi",
"synthetic",
"opencode",
"opencode-zen",
"zai",
"z.ai",
"glm",
"zhipu",
"minimax",
"bedrock",
"aws-bedrock",
"qianfan",
"baidu",
"groq",
"mistral",
"xai",
"grok",
"deepseek",
"together",
"together-ai",
"fireworks",
"fireworks-ai",
"perplexity",
"cohere",
"copilot",
"github-copilot",
];
// ── Diagnostic item ──────────────────────────────────────────────
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Severity {
Ok,
Warn,
Error,
}
struct DiagItem {
severity: Severity,
category: &'static str,
message: String,
}
impl DiagItem {
fn ok(category: &'static str, msg: impl Into<String>) -> Self {
Self {
severity: Severity::Ok,
category,
message: msg.into(),
}
}
fn warn(category: &'static str, msg: impl Into<String>) -> Self {
Self {
severity: Severity::Warn,
category,
message: msg.into(),
}
}
fn error(category: &'static str, msg: impl Into<String>) -> Self {
Self {
severity: Severity::Error,
category,
message: msg.into(),
}
}
let raw = std::fs::read_to_string(&state_file)
.with_context(|| format!("Failed to read {}", state_file.display()))?;
let snapshot: serde_json::Value = serde_json::from_str(&raw)
.with_context(|| format!("Failed to parse {}", state_file.display()))?;
fn icon(&self) -> &'static str {
match self.severity {
Severity::Ok => "",
Severity::Warn => "⚠️ ",
Severity::Error => "",
}
}
}
println!("🩺 ZeroClaw Doctor");
println!(" State file: {}", state_file.display());
// ── Public entry point ───────────────────────────────────────────
pub fn run(config: &Config) -> Result<()> {
let mut items: Vec<DiagItem> = Vec::new();
check_config_semantics(config, &mut items);
check_workspace(config, &mut items);
check_daemon_state(config, &mut items);
check_environment(&mut items);
// Print report
println!("🩺 ZeroClaw Doctor (enhanced)");
println!();
let mut current_cat = "";
for item in &items {
if item.category != current_cat {
current_cat = item.category;
println!(" [{current_cat}]");
}
println!(" {} {}", item.icon(), item.message);
}
let errors = items
.iter()
.filter(|i| i.severity == Severity::Error)
.count();
let warns = items
.iter()
.filter(|i| i.severity == Severity::Warn)
.count();
let oks = items.iter().filter(|i| i.severity == Severity::Ok).count();
println!();
println!(" Summary: {oks} ok, {warns} warnings, {errors} errors");
if errors > 0 {
println!(" 💡 Fix the errors above, then run `zeroclaw doctor` again.");
}
Ok(())
}
// ── Config semantic validation ───────────────────────────────────
fn check_config_semantics(config: &Config, items: &mut Vec<DiagItem>) {
let cat = "config";
// Config file exists
if config.config_path.exists() {
items.push(DiagItem::ok(
cat,
format!("config file: {}", config.config_path.display()),
));
} else {
items.push(DiagItem::error(
cat,
format!("config file not found: {}", config.config_path.display()),
));
}
// Provider validity
if let Some(ref provider) = config.default_provider {
if is_known_provider(provider) {
items.push(DiagItem::ok(
cat,
format!("provider \"{provider}\" is valid"),
));
} else {
items.push(DiagItem::error(
cat,
format!(
"unknown provider \"{provider}\". Use a known name or \"custom:<url>\" / \"anthropic-custom:<url>\""
),
));
}
} else {
items.push(DiagItem::error(cat, "no default_provider configured"));
}
// API key presence
if config.default_provider.as_deref() != Some("ollama") {
if config.api_key.is_some() {
items.push(DiagItem::ok(cat, "API key configured"));
} else {
items.push(DiagItem::warn(
cat,
"no api_key set (may rely on env vars or provider defaults)",
));
}
}
// Model configured
if config.default_model.is_some() {
items.push(DiagItem::ok(
cat,
format!(
"default model: {}",
config.default_model.as_deref().unwrap_or("?")
),
));
} else {
items.push(DiagItem::warn(cat, "no default_model configured"));
}
// Temperature range
if config.default_temperature >= 0.0 && config.default_temperature <= 2.0 {
items.push(DiagItem::ok(
cat,
format!(
"temperature {:.1} (valid range 0.02.0)",
config.default_temperature
),
));
} else {
items.push(DiagItem::error(
cat,
format!(
"temperature {:.1} is out of range (expected 0.02.0)",
config.default_temperature
),
));
}
// Gateway port range
let port = config.gateway.port;
if port > 0 {
items.push(DiagItem::ok(cat, format!("gateway port: {port}")));
} else {
items.push(DiagItem::error(cat, "gateway port is 0 (invalid)"));
}
// Reliability: fallback providers
for fb in &config.reliability.fallback_providers {
if !is_known_provider(fb) {
items.push(DiagItem::warn(
cat,
format!("fallback provider \"{fb}\" is not a known provider name"),
));
}
}
// Model routes validation
for route in &config.model_routes {
if route.hint.is_empty() {
items.push(DiagItem::warn(cat, "model route with empty hint"));
}
if !is_known_provider(&route.provider) {
items.push(DiagItem::warn(
cat,
format!(
"model route \"{}\" references unknown provider \"{}\"",
route.hint, route.provider
),
));
}
if route.model.is_empty() {
items.push(DiagItem::warn(
cat,
format!("model route \"{}\" has empty model", route.hint),
));
}
}
// Channel: at least one configured
let cc = &config.channels_config;
let has_channel = cc.telegram.is_some()
|| cc.discord.is_some()
|| cc.slack.is_some()
|| cc.imessage.is_some()
|| cc.matrix.is_some()
|| cc.whatsapp.is_some()
|| cc.email.is_some()
|| cc.irc.is_some()
|| cc.lark.is_some()
|| cc.webhook.is_some();
if has_channel {
items.push(DiagItem::ok(cat, "at least one channel configured"));
} else {
items.push(DiagItem::warn(
cat,
"no channels configured — run `zeroclaw onboard` to set one up",
));
}
// Delegate agents: provider validity
for (name, agent) in &config.agents {
if !is_known_provider(&agent.provider) {
items.push(DiagItem::warn(
cat,
format!(
"agent \"{name}\" uses unknown provider \"{}\"",
agent.provider
),
));
}
}
}
fn is_known_provider(name: &str) -> bool {
KNOWN_PROVIDERS.contains(&name)
|| name.starts_with("custom:")
|| name.starts_with("anthropic-custom:")
}
// ── Workspace integrity ──────────────────────────────────────────
fn check_workspace(config: &Config, items: &mut Vec<DiagItem>) {
let cat = "workspace";
let ws = &config.workspace_dir;
if ws.exists() {
items.push(DiagItem::ok(
cat,
format!("directory exists: {}", ws.display()),
));
} else {
items.push(DiagItem::error(
cat,
format!("directory missing: {}", ws.display()),
));
return;
}
// Writable check
let probe = ws.join(".zeroclaw_doctor_probe");
match std::fs::write(&probe, b"probe") {
Ok(()) => {
let _ = std::fs::remove_file(&probe);
items.push(DiagItem::ok(cat, "directory is writable"));
}
Err(e) => {
items.push(DiagItem::error(
cat,
format!("directory is not writable: {e}"),
));
}
}
// Disk space (best-effort via `df`)
if let Some(avail_mb) = disk_available_mb(ws) {
if avail_mb >= 100 {
items.push(DiagItem::ok(
cat,
format!("disk space: {avail_mb} MB available"),
));
} else {
items.push(DiagItem::warn(
cat,
format!("low disk space: only {avail_mb} MB available"),
));
}
}
// Key workspace files
check_file_exists(ws, "SOUL.md", false, cat, items);
check_file_exists(ws, "AGENTS.md", false, cat, items);
}
fn check_file_exists(
base: &Path,
name: &str,
required: bool,
cat: &'static str,
items: &mut Vec<DiagItem>,
) {
let path = base.join(name);
if path.exists() {
items.push(DiagItem::ok(cat, format!("{name} present")));
} else if required {
items.push(DiagItem::error(cat, format!("{name} missing")));
} else {
items.push(DiagItem::warn(cat, format!("{name} not found (optional)")));
}
}
fn disk_available_mb(path: &Path) -> Option<u64> {
let output = std::process::Command::new("df")
.arg("-m")
.arg(path)
.output()
.ok()?;
if !output.status.success() {
return None;
}
let stdout = String::from_utf8_lossy(&output.stdout);
// Second line, 4th column is "Available" in `df -m`
let line = stdout.lines().nth(1)?;
let avail = line.split_whitespace().nth(3)?;
avail.parse::<u64>().ok()
}
// ── Daemon state (original logic, preserved) ─────────────────────
fn check_daemon_state(config: &Config, items: &mut Vec<DiagItem>) {
let cat = "daemon";
let state_file = crate::daemon::state_file_path(config);
if !state_file.exists() {
items.push(DiagItem::error(
cat,
format!(
"state file not found: {} — is the daemon running?",
state_file.display()
),
));
return;
}
let raw = match std::fs::read_to_string(&state_file) {
Ok(r) => r,
Err(e) => {
items.push(DiagItem::error(cat, format!("cannot read state file: {e}")));
return;
}
};
let snapshot: serde_json::Value = match serde_json::from_str(&raw) {
Ok(v) => v,
Err(e) => {
items.push(DiagItem::error(cat, format!("invalid state JSON: {e}")));
return;
}
};
// Daemon heartbeat freshness
let updated_at = snapshot
.get("updated_at")
.and_then(serde_json::Value::as_str)
@ -33,28 +434,32 @@ pub fn run(config: &Config) -> Result<()> {
.signed_duration_since(ts.with_timezone(&Utc))
.num_seconds();
if age <= DAEMON_STALE_SECONDS {
println!(" ✅ daemon heartbeat fresh ({age}s ago)");
items.push(DiagItem::ok(cat, format!("heartbeat fresh ({age}s ago)")));
} else {
println!(" ❌ daemon heartbeat stale ({age}s ago)");
items.push(DiagItem::error(
cat,
format!("heartbeat stale ({age}s ago)"),
));
}
} else {
println!(" ❌ invalid daemon timestamp: {updated_at}");
items.push(DiagItem::error(
cat,
format!("invalid daemon timestamp: {updated_at}"),
));
}
let mut channel_count = 0_u32;
let mut stale_channels = 0_u32;
// Components
if let Some(components) = snapshot
.get("components")
.and_then(serde_json::Value::as_object)
{
// Scheduler
if let Some(scheduler) = components.get("scheduler") {
let scheduler_ok = scheduler
.get("status")
.and_then(serde_json::Value::as_str)
.is_some_and(|s| s == "ok");
let scheduler_last_ok = scheduler
let scheduler_age = scheduler
.get("last_ok")
.and_then(serde_json::Value::as_str)
.and_then(parse_rfc3339)
@ -62,22 +467,28 @@ pub fn run(config: &Config) -> Result<()> {
Utc::now().signed_duration_since(dt).num_seconds()
});
if scheduler_ok && scheduler_last_ok <= SCHEDULER_STALE_SECONDS {
println!(" ✅ scheduler healthy (last ok {scheduler_last_ok}s ago)");
if scheduler_ok && scheduler_age <= SCHEDULER_STALE_SECONDS {
items.push(DiagItem::ok(
cat,
format!("scheduler healthy (last ok {scheduler_age}s ago)"),
));
} else {
println!(
" ❌ scheduler unhealthy/stale (status_ok={scheduler_ok}, age={scheduler_last_ok}s)"
);
items.push(DiagItem::error(
cat,
format!("scheduler unhealthy (ok={scheduler_ok}, age={scheduler_age}s)"),
));
}
} else {
println!(" ❌ scheduler component missing");
items.push(DiagItem::warn(cat, "scheduler component not tracked yet"));
}
// Channels
let mut channel_count = 0u32;
let mut stale = 0u32;
for (name, component) in components {
if !name.starts_with("channel:") {
continue;
}
channel_count += 1;
let status_ok = component
.get("status")
@ -92,23 +503,88 @@ pub fn run(config: &Config) -> Result<()> {
});
if status_ok && age <= CHANNEL_STALE_SECONDS {
println!("{name} fresh (last ok {age}s ago)");
items.push(DiagItem::ok(cat, format!("{name} fresh ({age}s ago)")));
} else {
stale_channels += 1;
println!("{name} stale/unhealthy (status_ok={status_ok}, age={age}s)");
stale += 1;
items.push(DiagItem::error(
cat,
format!("{name} stale (ok={status_ok}, age={age}s)"),
));
}
}
}
if channel_count == 0 {
println!(" no channel components tracked in state yet");
} else {
println!(" Channel summary: {channel_count} total, {stale_channels} stale");
if channel_count == 0 {
items.push(DiagItem::warn(cat, "no channel components tracked yet"));
} else if stale > 0 {
items.push(DiagItem::warn(
cat,
format!("{channel_count} channels, {stale} stale"),
));
}
}
Ok(())
}
// ── Environment checks ───────────────────────────────────────────
fn check_environment(items: &mut Vec<DiagItem>) {
let cat = "environment";
// git
check_command_available("git", &["--version"], cat, items);
// Shell
let shell = std::env::var("SHELL").unwrap_or_default();
if !shell.is_empty() {
items.push(DiagItem::ok(cat, format!("shell: {shell}")));
} else {
items.push(DiagItem::warn(cat, "$SHELL not set"));
}
// HOME
if std::env::var("HOME").is_ok() || std::env::var("USERPROFILE").is_ok() {
items.push(DiagItem::ok(cat, "home directory env set"));
} else {
items.push(DiagItem::error(
cat,
"neither $HOME nor $USERPROFILE is set",
));
}
// Optional tools
check_command_available("curl", &["--version"], cat, items);
}
fn check_command_available(cmd: &str, args: &[&str], cat: &'static str, items: &mut Vec<DiagItem>) {
match std::process::Command::new(cmd)
.args(args)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.output()
{
Ok(output) if output.status.success() => {
let ver = String::from_utf8_lossy(&output.stdout);
let first_line = ver.lines().next().unwrap_or("").trim();
let display = if first_line.len() > 60 {
format!("{}", &first_line[..60])
} else {
first_line.to_string()
};
items.push(DiagItem::ok(cat, format!("{cmd}: {display}")));
}
Ok(_) => {
items.push(DiagItem::warn(
cat,
format!("{cmd} found but returned non-zero"),
));
}
Err(_) => {
items.push(DiagItem::warn(cat, format!("{cmd} not found in PATH")));
}
}
}
// ── Helpers ──────────────────────────────────────────────────────
fn parse_rfc3339(raw: &str) -> Option<DateTime<Utc>> {
DateTime::parse_from_rfc3339(raw)
.ok()
@ -118,85 +594,118 @@ fn parse_rfc3339(raw: &str) -> Option<DateTime<Utc>> {
#[cfg(test)]
mod tests {
use super::*;
use crate::config::Config;
use serde_json::json;
use tempfile::TempDir;
fn test_config(tmp: &TempDir) -> Config {
#[test]
fn known_providers_recognized() {
assert!(is_known_provider("openrouter"));
assert!(is_known_provider("anthropic"));
assert!(is_known_provider("ollama"));
assert!(is_known_provider("gemini"));
assert!(is_known_provider("custom:https://example.com"));
assert!(is_known_provider("anthropic-custom:https://example.com"));
assert!(!is_known_provider("nonexistent-provider"));
assert!(!is_known_provider(""));
}
#[test]
fn diag_item_icons() {
assert_eq!(DiagItem::ok("t", "m").icon(), "");
assert_eq!(DiagItem::warn("t", "m").icon(), "⚠️ ");
assert_eq!(DiagItem::error("t", "m").icon(), "");
}
#[test]
fn config_validation_catches_bad_temperature() {
let mut config = Config::default();
config.workspace_dir = tmp.path().join("workspace");
config.config_path = tmp.path().join("config.toml");
config
config.default_temperature = 5.0;
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let temp_item = items.iter().find(|i| i.message.contains("temperature"));
assert!(temp_item.is_some());
assert_eq!(temp_item.unwrap().severity, Severity::Error);
}
#[test]
fn parse_rfc3339_accepts_valid_timestamp() {
let parsed = parse_rfc3339("2025-01-02T03:04:05Z");
assert!(parsed.is_some());
fn config_validation_accepts_valid_temperature() {
let mut config = Config::default();
config.default_temperature = 0.7;
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let temp_item = items.iter().find(|i| i.message.contains("temperature"));
assert!(temp_item.is_some());
assert_eq!(temp_item.unwrap().severity, Severity::Ok);
}
#[test]
fn parse_rfc3339_rejects_invalid_timestamp() {
let parsed = parse_rfc3339("not-a-timestamp");
assert!(parsed.is_none());
fn config_validation_warns_no_channels() {
let config = Config::default();
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let ch_item = items.iter().find(|i| i.message.contains("channel"));
assert!(ch_item.is_some());
assert_eq!(ch_item.unwrap().severity, Severity::Warn);
}
#[test]
fn run_returns_ok_when_state_file_missing() {
let tmp = TempDir::new().unwrap();
let config = test_config(&tmp);
let result = run(&config);
assert!(result.is_ok());
fn config_validation_catches_unknown_provider() {
let mut config = Config::default();
config.default_provider = Some("totally-fake".into());
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let prov_item = items
.iter()
.find(|i| i.message.contains("unknown provider"));
assert!(prov_item.is_some());
assert_eq!(prov_item.unwrap().severity, Severity::Error);
}
#[test]
fn run_returns_error_for_invalid_json_state_file() {
let tmp = TempDir::new().unwrap();
let config = test_config(&tmp);
let state_file = crate::daemon::state_file_path(&config);
std::fs::write(&state_file, "not-json").unwrap();
let result = run(&config);
assert!(result.is_err());
let error_text = result.unwrap_err().to_string();
assert!(error_text.contains("Failed to parse"));
fn config_validation_accepts_custom_provider() {
let mut config = Config::default();
config.default_provider = Some("custom:https://my-api.com".into());
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let prov_item = items.iter().find(|i| i.message.contains("is valid"));
assert!(prov_item.is_some());
assert_eq!(prov_item.unwrap().severity, Severity::Ok);
}
#[test]
fn run_accepts_well_formed_state_snapshot() {
let tmp = TempDir::new().unwrap();
let config = test_config(&tmp);
let state_file = crate::daemon::state_file_path(&config);
fn config_validation_warns_bad_fallback() {
let mut config = Config::default();
config.reliability.fallback_providers = vec!["fake-provider".into()];
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let fb_item = items
.iter()
.find(|i| i.message.contains("fallback provider"));
assert!(fb_item.is_some());
assert_eq!(fb_item.unwrap().severity, Severity::Warn);
}
let now = Utc::now().to_rfc3339();
let snapshot = json!({
"updated_at": now,
"components": {
"scheduler": {
"status": "ok",
"last_ok": now,
"last_error": null,
"updated_at": now,
"restart_count": 0
},
"channel:discord": {
"status": "ok",
"last_ok": now,
"last_error": null,
"updated_at": now,
"restart_count": 0
}
}
});
#[test]
fn config_validation_warns_empty_model_route() {
let mut config = Config::default();
config.model_routes = vec![crate::config::ModelRouteConfig {
hint: "fast".into(),
provider: "groq".into(),
model: String::new(),
api_key: None,
}];
let mut items = Vec::new();
check_config_semantics(&config, &mut items);
let route_item = items.iter().find(|i| i.message.contains("empty model"));
assert!(route_item.is_some());
assert_eq!(route_item.unwrap().severity, Severity::Warn);
}
std::fs::write(&state_file, serde_json::to_vec_pretty(&snapshot).unwrap()).unwrap();
let result = run(&config);
assert!(result.is_ok());
#[test]
fn environment_check_finds_git() {
let mut items = Vec::new();
check_environment(&mut items);
let git_item = items.iter().find(|i| i.message.starts_with("git:"));
// git should be available in any CI/dev environment
assert!(git_item.is_some());
assert_eq!(git_item.unwrap().severity, Severity::Ok);
}
}