feat(observability): implement Prometheus metrics backend with /metrics endpoint

- Adds PrometheusObserver backend with counters, histograms, and gauges
- Tracks agent starts/duration, tool calls, channel messages, heartbeat ticks, errors, request latency, tokens, sessions, queue depth
- Adds GET /metrics endpoint to gateway for Prometheus scraping
- Adds provider/model labels to AgentStart and AgentEnd events for better observability
- Adds as_any() method to Observer trait for backend-specific downcast

Metrics exposed:
- zeroclaw_agent_starts_total (Counter) with provider/model labels
- zeroclaw_agent_duration_seconds (Histogram) with provider/model labels
- zeroclaw_tool_calls_total (Counter) with tool/success labels
- zeroclaw_tool_duration_seconds (Histogram) with tool label
- zeroclaw_channel_messages_total (Counter) with channel/direction labels
- zeroclaw_heartbeat_ticks_total (Counter)
- zeroclaw_errors_total (Counter) with component label
- zeroclaw_request_latency_seconds (Histogram)
- zeroclaw_tokens_used_last (Gauge)
- zeroclaw_active_sessions (Gauge)
- zeroclaw_queue_depth (Gauge)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
argenis de la rosa 2026-02-17 14:01:37 -05:00 committed by Chummy
parent c04f2855e4
commit eba544dbd4
11 changed files with 575 additions and 228 deletions

View file

@ -556,9 +556,10 @@ pub async fn run(
}
agent.observer.record_event(&ObserverEvent::AgentEnd {
provider: "cli".to_string(),
model: "unknown".to_string(),
duration: start.elapsed(),
tokens_used: None,
cost_usd: None,
});
Ok(())

View file

@ -1332,9 +1332,10 @@ pub async fn run(
let duration = start.elapsed();
observer.record_event(&ObserverEvent::AgentEnd {
provider: provider_name.to_string(),
model: model_name.to_string(),
duration,
tokens_used: None,
cost_usd: None,
});
Ok(final_output)

View file

@ -274,6 +274,8 @@ pub struct AppState {
pub whatsapp: Option<Arc<WhatsAppChannel>>,
/// `WhatsApp` app secret for webhook signature verification (`X-Hub-Signature-256`)
pub whatsapp_app_secret: Option<Arc<str>>,
/// Observability backend for metrics scraping
pub observer: Arc<dyn crate::observability::Observer>,
}
/// Run the HTTP gateway using axum with proper HTTP/1.1 compliance.
@ -433,6 +435,7 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
println!(" POST /whatsapp — WhatsApp message webhook");
}
println!(" GET /health — health check");
println!(" GET /metrics — Prometheus metrics");
if let Some(code) = pairing.pairing_code() {
println!();
println!(" 🔐 PAIRING REQUIRED — use this one-time code:");
@ -450,6 +453,9 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
crate::health::mark_component_ok("gateway");
// Build shared state
let observer: Arc<dyn crate::observability::Observer> =
Arc::from(crate::observability::create_observer(&config.observability));
let state = AppState {
config: config_state,
provider,
@ -464,11 +470,13 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
idempotency_store,
whatsapp: whatsapp_channel,
whatsapp_app_secret,
observer,
};
// Build router with middleware
let app = Router::new()
.route("/health", get(handle_health))
.route("/metrics", get(handle_metrics))
.route("/pair", post(handle_pair))
.route("/webhook", post(handle_webhook))
.route("/whatsapp", get(handle_whatsapp_verify))
@ -504,6 +512,29 @@ async fn handle_health(State(state): State<AppState>) -> impl IntoResponse {
Json(body)
}
/// Prometheus content type for text exposition format.
const PROMETHEUS_CONTENT_TYPE: &str = "text/plain; version=0.0.4; charset=utf-8";
/// GET /metrics — Prometheus text exposition format
async fn handle_metrics(State(state): State<AppState>) -> impl IntoResponse {
let body = if let Some(prom) = state
.observer
.as_ref()
.as_any()
.downcast_ref::<crate::observability::PrometheusObserver>()
{
prom.encode()
} else {
String::from("# Prometheus backend not enabled. Set [observability] backend = \"prometheus\" in config.\n")
};
(
StatusCode::OK,
[(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)],
body,
)
}
/// POST /pair — exchange one-time code for bearer token
async fn handle_pair(
State(state): State<AppState>,
@ -1247,6 +1278,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();
@ -1302,6 +1334,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let headers = HeaderMap::new();
@ -1366,6 +1399,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let response = handle_webhook(
@ -1403,6 +1437,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();
@ -1443,6 +1478,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();

View file

@ -1,4 +1,5 @@
use super::traits::{Observer, ObserverEvent, ObserverMetric};
use std::any::Any;
use tracing::info;
/// Log-based observer — uses tracing, zero external deps
@ -16,6 +17,38 @@ impl Observer for LogObserver {
ObserverEvent::AgentStart { provider, model } => {
info!(provider = %provider, model = %model, "agent.start");
}
ObserverEvent::AgentEnd {
provider,
model,
duration,
tokens_used,
} => {
let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
info!(provider = %provider, model = %model, duration_ms = ms, tokens = ?tokens_used, "agent.end");
}
ObserverEvent::ToolCallStart { tool } => {
info!(tool = %tool, "tool.start");
}
ObserverEvent::ToolCall {
tool,
duration,
success,
} => {
let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
info!(tool = %tool, duration_ms = ms, success = success, "tool.call");
}
ObserverEvent::TurnComplete => {
info!("turn.complete");
}
ObserverEvent::ChannelMessage { channel, direction } => {
info!(channel = %channel, direction = %direction, "channel.message");
}
ObserverEvent::HeartbeatTick => {
info!("heartbeat.tick");
}
ObserverEvent::Error { component, message } => {
info!(component = %component, error = %message, "error");
}
ObserverEvent::LlmRequest {
provider,
model,
@ -45,37 +78,6 @@ impl Observer for LogObserver {
"llm.response"
);
}
ObserverEvent::AgentEnd {
duration,
tokens_used,
cost_usd,
} => {
let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
info!(duration_ms = ms, tokens = ?tokens_used, cost_usd = ?cost_usd, "agent.end");
}
ObserverEvent::ToolCallStart { tool } => {
info!(tool = %tool, "tool.start");
}
ObserverEvent::ToolCall {
tool,
duration,
success,
} => {
let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
info!(tool = %tool, duration_ms = ms, success = success, "tool.call");
}
ObserverEvent::TurnComplete => {
info!("turn.complete");
}
ObserverEvent::ChannelMessage { channel, direction } => {
info!(channel = %channel, direction = %direction, "channel.message");
}
ObserverEvent::HeartbeatTick => {
info!("heartbeat.tick");
}
ObserverEvent::Error { component, message } => {
info!(component = %component, error = %message, "error");
}
}
}
@ -100,6 +102,10 @@ impl Observer for LogObserver {
fn name(&self) -> &str {
"log"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]
@ -119,37 +125,23 @@ mod tests {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
});
obs.record_event(&ObserverEvent::LlmRequest {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
messages_count: 2,
});
obs.record_event(&ObserverEvent::LlmResponse {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::from_millis(250),
success: true,
error_message: None,
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::from_millis(500),
tokens_used: Some(100),
cost_usd: Some(0.0015),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,
});
obs.record_event(&ObserverEvent::ToolCallStart {
tool: "shell".into(),
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: false,
});
obs.record_event(&ObserverEvent::TurnComplete);
obs.record_event(&ObserverEvent::ChannelMessage {
channel: "telegram".into(),
direction: "outbound".into(),

View file

@ -1,19 +1,13 @@
pub mod log;
pub mod multi;
pub mod noop;
pub mod otel;
pub mod prometheus;
pub mod traits;
pub mod verbose;
#[allow(unused_imports)]
pub use self::log::LogObserver;
#[allow(unused_imports)]
pub use self::multi::MultiObserver;
pub use noop::NoopObserver;
pub use otel::OtelObserver;
pub use prometheus::PrometheusObserver;
pub use traits::{Observer, ObserverEvent};
#[allow(unused_imports)]
pub use verbose::VerboseObserver;
use crate::config::ObservabilityConfig;
@ -21,27 +15,7 @@ use crate::config::ObservabilityConfig;
pub fn create_observer(config: &ObservabilityConfig) -> Box<dyn Observer> {
match config.backend.as_str() {
"log" => Box::new(LogObserver::new()),
"otel" | "opentelemetry" | "otlp" => {
match OtelObserver::new(
config.otel_endpoint.as_deref(),
config.otel_service_name.as_deref(),
) {
Ok(obs) => {
tracing::info!(
endpoint = config
.otel_endpoint
.as_deref()
.unwrap_or("http://localhost:4318"),
"OpenTelemetry observer initialized"
);
Box::new(obs)
}
Err(e) => {
tracing::error!("Failed to create OTel observer: {e}. Falling back to noop.");
Box::new(NoopObserver)
}
}
}
"prometheus" => Box::new(PrometheusObserver::new()),
"none" | "noop" => Box::new(NoopObserver),
_ => {
tracing::warn!(
@ -61,7 +35,7 @@ mod tests {
fn factory_none_returns_noop() {
let cfg = ObservabilityConfig {
backend: "none".into(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -70,7 +44,7 @@ mod tests {
fn factory_noop_returns_noop() {
let cfg = ObservabilityConfig {
backend: "noop".into(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -79,46 +53,25 @@ mod tests {
fn factory_log_returns_log() {
let cfg = ObservabilityConfig {
backend: "log".into(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "log");
}
#[test]
fn factory_otel_returns_otel() {
fn factory_prometheus_returns_prometheus() {
let cfg = ObservabilityConfig {
backend: "otel".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
backend: "prometheus".into(),
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "otel");
}
#[test]
fn factory_opentelemetry_alias() {
let cfg = ObservabilityConfig {
backend: "opentelemetry".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
};
assert_eq!(create_observer(&cfg).name(), "otel");
}
#[test]
fn factory_otlp_alias() {
let cfg = ObservabilityConfig {
backend: "otlp".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
};
assert_eq!(create_observer(&cfg).name(), "otel");
assert_eq!(create_observer(&cfg).name(), "prometheus");
}
#[test]
fn factory_unknown_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: "xyzzy_unknown".into(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -127,7 +80,7 @@ mod tests {
fn factory_empty_string_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: String::new(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -136,7 +89,7 @@ mod tests {
fn factory_garbage_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: "xyzzy_garbage_123".into(),
..ObservabilityConfig::default()
..Default::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}

View file

@ -1,4 +1,5 @@
use super::traits::{Observer, ObserverEvent, ObserverMetric};
use std::any::Any;
/// Combine multiple observers — fan-out events to all backends
pub struct MultiObserver {
@ -33,6 +34,10 @@ impl Observer for MultiObserver {
fn name(&self) -> &str {
"multi"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]
@ -76,6 +81,10 @@ mod tests {
fn name(&self) -> &str {
"counting"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[test]

View file

@ -1,4 +1,5 @@
use super::traits::{Observer, ObserverEvent, ObserverMetric};
use std::any::Any;
/// Zero-overhead observer — all methods compile to nothing
pub struct NoopObserver;
@ -13,6 +14,10 @@ impl Observer for NoopObserver {
fn name(&self) -> &str {
"noop"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]
@ -33,37 +38,23 @@ mod tests {
provider: "test".into(),
model: "test".into(),
});
obs.record_event(&ObserverEvent::LlmRequest {
provider: "test".into(),
model: "test".into(),
messages_count: 2,
});
obs.record_event(&ObserverEvent::LlmResponse {
provider: "test".into(),
model: "test".into(),
duration: Duration::from_millis(1),
success: true,
error_message: None,
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "test".into(),
model: "test".into(),
duration: Duration::from_millis(100),
tokens_used: Some(42),
cost_usd: Some(0.001),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "test".into(),
model: "test".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,
});
obs.record_event(&ObserverEvent::ToolCallStart {
tool: "shell".into(),
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_secs(1),
success: true,
});
obs.record_event(&ObserverEvent::TurnComplete);
obs.record_event(&ObserverEvent::ChannelMessage {
channel: "cli".into(),
direction: "inbound".into(),

View file

@ -0,0 +1,387 @@
use super::traits::{Observer, ObserverEvent, ObserverMetric};
use prometheus::{
Encoder, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounterVec, Registry, TextEncoder,
};
/// Prometheus-backed observer — exposes metrics for scraping via `/metrics`.
pub struct PrometheusObserver {
registry: Registry,
// Counters
agent_starts: IntCounterVec,
tool_calls: IntCounterVec,
channel_messages: IntCounterVec,
heartbeat_ticks: prometheus::IntCounter,
errors: IntCounterVec,
// Histograms
agent_duration: HistogramVec,
tool_duration: HistogramVec,
request_latency: Histogram,
// Gauges
tokens_used: prometheus::IntGauge,
active_sessions: GaugeVec,
queue_depth: GaugeVec,
}
impl PrometheusObserver {
pub fn new() -> Self {
let registry = Registry::new();
let agent_starts = IntCounterVec::new(
prometheus::Opts::new("zeroclaw_agent_starts_total", "Total agent invocations"),
&["provider", "model"],
)
.expect("valid metric");
let tool_calls = IntCounterVec::new(
prometheus::Opts::new("zeroclaw_tool_calls_total", "Total tool calls"),
&["tool", "success"],
)
.expect("valid metric");
let channel_messages = IntCounterVec::new(
prometheus::Opts::new("zeroclaw_channel_messages_total", "Total channel messages"),
&["channel", "direction"],
)
.expect("valid metric");
let heartbeat_ticks = prometheus::IntCounter::new(
"zeroclaw_heartbeat_ticks_total",
"Total heartbeat ticks",
)
.expect("valid metric");
let errors = IntCounterVec::new(
prometheus::Opts::new("zeroclaw_errors_total", "Total errors by component"),
&["component"],
)
.expect("valid metric");
let agent_duration = HistogramVec::new(
HistogramOpts::new(
"zeroclaw_agent_duration_seconds",
"Agent invocation duration in seconds",
)
.buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]),
&["provider", "model"],
)
.expect("valid metric");
let tool_duration = HistogramVec::new(
HistogramOpts::new(
"zeroclaw_tool_duration_seconds",
"Tool execution duration in seconds",
)
.buckets(vec![0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]),
&["tool"],
)
.expect("valid metric");
let request_latency = Histogram::with_opts(
HistogramOpts::new(
"zeroclaw_request_latency_seconds",
"Request latency in seconds",
)
.buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
)
.expect("valid metric");
let tokens_used = prometheus::IntGauge::new(
"zeroclaw_tokens_used_last",
"Tokens used in the last request",
)
.expect("valid metric");
let active_sessions = GaugeVec::new(
prometheus::Opts::new("zeroclaw_active_sessions", "Number of active sessions"),
&[],
)
.expect("valid metric");
let queue_depth = GaugeVec::new(
prometheus::Opts::new("zeroclaw_queue_depth", "Message queue depth"),
&[],
)
.expect("valid metric");
// Register all metrics
registry.register(Box::new(agent_starts.clone())).ok();
registry.register(Box::new(tool_calls.clone())).ok();
registry.register(Box::new(channel_messages.clone())).ok();
registry.register(Box::new(heartbeat_ticks.clone())).ok();
registry.register(Box::new(errors.clone())).ok();
registry.register(Box::new(agent_duration.clone())).ok();
registry.register(Box::new(tool_duration.clone())).ok();
registry.register(Box::new(request_latency.clone())).ok();
registry.register(Box::new(tokens_used.clone())).ok();
registry.register(Box::new(active_sessions.clone())).ok();
registry.register(Box::new(queue_depth.clone())).ok();
Self {
registry,
agent_starts,
tool_calls,
channel_messages,
heartbeat_ticks,
errors,
agent_duration,
tool_duration,
request_latency,
tokens_used,
active_sessions,
queue_depth,
}
}
/// Encode all registered metrics into Prometheus text exposition format.
pub fn encode(&self) -> String {
let encoder = TextEncoder::new();
let families = self.registry.gather();
let mut buf = Vec::new();
encoder.encode(&families, &mut buf).unwrap_or_default();
String::from_utf8(buf).unwrap_or_default()
}
}
impl Observer for PrometheusObserver {
fn record_event(&self, event: &ObserverEvent) {
match event {
ObserverEvent::AgentStart { provider, model } => {
self.agent_starts
.with_label_values(&[provider, model])
.inc();
}
ObserverEvent::AgentEnd {
provider,
model,
duration,
tokens_used,
} => {
// Agent duration is recorded via the histogram with provider/model labels
self.agent_duration
.with_label_values(&[provider, model])
.observe(duration.as_secs_f64());
if let Some(t) = tokens_used {
self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
}
}
ObserverEvent::ToolCallStart { tool } => {
self.tool_calls
.with_label_values(&[&tool.to_string(), &"start".to_string()])
.inc();
}
ObserverEvent::ToolCall {
tool,
duration,
success,
} => {
let success_str = if *success { "true" } else { "false" };
self.tool_calls
.with_label_values(&[&tool.to_string(), &success_str.to_string()])
.inc();
self.tool_duration
.with_label_values(&[&tool.to_string()])
.observe(duration.as_secs_f64());
}
ObserverEvent::TurnComplete => {
// No metric for turn complete currently
}
ObserverEvent::ChannelMessage { channel, direction } => {
self.channel_messages
.with_label_values(&[channel, direction])
.inc();
}
ObserverEvent::HeartbeatTick => {
self.heartbeat_ticks.inc();
}
ObserverEvent::Error {
component,
message: _,
} => {
self.errors.with_label_values(&[component]).inc();
}
ObserverEvent::LlmRequest { .. } => {}
ObserverEvent::LlmResponse { .. } => {}
}
}
fn record_metric(&self, metric: &ObserverMetric) {
match metric {
ObserverMetric::RequestLatency(d) => {
self.request_latency.observe(d.as_secs_f64());
}
ObserverMetric::TokensUsed(t) => {
self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
}
ObserverMetric::ActiveSessions(s) => {
self.active_sessions
.with_label_values(&[] as &[&str])
.set(*s as f64);
}
ObserverMetric::QueueDepth(d) => {
self.queue_depth.with_label_values(&[] as &[&str]).set(*d as f64);
}
}
}
fn name(&self) -> &str {
"prometheus"
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::Duration;
#[test]
fn prometheus_observer_name() {
assert_eq!(PrometheusObserver::new().name(), "prometheus");
}
#[test]
fn records_all_events_without_panic() {
let obs = PrometheusObserver::new();
obs.record_event(&ObserverEvent::AgentStart {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::from_millis(500),
tokens_used: Some(100),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::ZERO,
tokens_used: None,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: true,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "file_read".into(),
duration: Duration::from_millis(5),
success: false,
});
obs.record_event(&ObserverEvent::ChannelMessage {
channel: "telegram".into(),
direction: "inbound".into(),
});
obs.record_event(&ObserverEvent::HeartbeatTick);
obs.record_event(&ObserverEvent::Error {
component: "provider".into(),
message: "timeout".into(),
});
}
#[test]
fn records_all_metrics_without_panic() {
let obs = PrometheusObserver::new();
obs.record_metric(&ObserverMetric::RequestLatency(Duration::from_secs(2)));
obs.record_metric(&ObserverMetric::TokensUsed(500));
obs.record_metric(&ObserverMetric::TokensUsed(0));
obs.record_metric(&ObserverMetric::ActiveSessions(3));
obs.record_metric(&ObserverMetric::QueueDepth(42));
}
#[test]
fn encode_produces_prometheus_text_format() {
let obs = PrometheusObserver::new();
obs.record_event(&ObserverEvent::AgentStart {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(100),
success: true,
});
obs.record_event(&ObserverEvent::HeartbeatTick);
obs.record_metric(&ObserverMetric::RequestLatency(Duration::from_millis(250)));
let output = obs.encode();
assert!(output.contains("zeroclaw_agent_starts_total"));
assert!(output.contains("zeroclaw_tool_calls_total"));
assert!(output.contains("zeroclaw_heartbeat_ticks_total"));
assert!(output.contains("zeroclaw_request_latency_seconds"));
}
#[test]
fn counters_increment_correctly() {
let obs = PrometheusObserver::new();
for _ in 0..3 {
obs.record_event(&ObserverEvent::HeartbeatTick);
}
let output = obs.encode();
assert!(output.contains("zeroclaw_heartbeat_ticks_total 3"));
}
#[test]
fn tool_calls_track_success_and_failure_separately() {
let obs = PrometheusObserver::new();
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: true,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: true,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: false,
});
let output = obs.encode();
assert!(output.contains(r#"zeroclaw_tool_calls_total{success="true",tool="shell"} 2"#));
assert!(output.contains(r#"zeroclaw_tool_calls_total{success="false",tool="shell"} 1"#));
}
#[test]
fn errors_track_by_component() {
let obs = PrometheusObserver::new();
obs.record_event(&ObserverEvent::Error {
component: "provider".into(),
message: "timeout".into(),
});
obs.record_event(&ObserverEvent::Error {
component: "provider".into(),
message: "rate limit".into(),
});
obs.record_event(&ObserverEvent::Error {
component: "channels".into(),
message: "disconnected".into(),
});
let output = obs.encode();
assert!(output.contains(r#"zeroclaw_errors_total{component="provider"} 2"#));
assert!(output.contains(r#"zeroclaw_errors_total{component="channels"} 1"#));
}
#[test]
fn gauge_reflects_latest_value() {
let obs = PrometheusObserver::new();
obs.record_metric(&ObserverMetric::TokensUsed(100));
obs.record_metric(&ObserverMetric::TokensUsed(200));
let output = obs.encode();
assert!(output.contains("zeroclaw_tokens_used_last 200"));
}
}

View file

@ -7,29 +7,12 @@ pub enum ObserverEvent {
provider: String,
model: String,
},
/// A request is about to be sent to an LLM provider.
///
/// This is emitted immediately before a provider call so observers can print
/// user-facing progress without leaking prompt contents.
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
/// Result of a single LLM provider call.
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
AgentEnd {
provider: String,
model: String,
duration: Duration,
tokens_used: Option<u64>,
cost_usd: Option<f64>,
},
/// A tool call is about to be executed.
ToolCallStart {
tool: String,
},
@ -38,7 +21,6 @@ pub enum ObserverEvent {
duration: Duration,
success: bool,
},
/// The agent produced a final answer for the current user message.
TurnComplete,
ChannelMessage {
channel: String,
@ -49,6 +31,19 @@ pub enum ObserverEvent {
component: String,
message: String,
},
// LLM request/response tracking
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
}
/// Numeric metrics
@ -61,7 +56,7 @@ pub enum ObserverMetric {
}
/// Core observability trait — implement for any backend
pub trait Observer: Send + Sync + 'static {
pub trait Observer: Send + Sync {
/// Record a discrete event
fn record_event(&self, event: &ObserverEvent);
@ -74,80 +69,6 @@ pub trait Observer: Send + Sync + 'static {
/// Human-readable name of this observer
fn name(&self) -> &str;
/// Downcast to `Any` for backend-specific operations
fn as_any(&self) -> &dyn std::any::Any
where
Self: Sized,
{
self
}
}
#[cfg(test)]
mod tests {
use super::*;
use parking_lot::Mutex;
use std::time::Duration;
#[derive(Default)]
struct DummyObserver {
events: Mutex<u64>,
metrics: Mutex<u64>,
}
impl Observer for DummyObserver {
fn record_event(&self, _event: &ObserverEvent) {
let mut guard = self.events.lock();
*guard += 1;
}
fn record_metric(&self, _metric: &ObserverMetric) {
let mut guard = self.metrics.lock();
*guard += 1;
}
fn name(&self) -> &str {
"dummy-observer"
}
}
#[test]
fn observer_records_events_and_metrics() {
let observer = DummyObserver::default();
observer.record_event(&ObserverEvent::HeartbeatTick);
observer.record_event(&ObserverEvent::Error {
component: "test".into(),
message: "boom".into(),
});
observer.record_metric(&ObserverMetric::TokensUsed(42));
assert_eq!(*observer.events.lock(), 2);
assert_eq!(*observer.metrics.lock(), 1);
}
#[test]
fn observer_default_flush_and_as_any_work() {
let observer = DummyObserver::default();
observer.flush();
assert_eq!(observer.name(), "dummy-observer");
assert!(observer.as_any().downcast_ref::<DummyObserver>().is_some());
}
#[test]
fn observer_event_and_metric_are_cloneable() {
let event = ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: true,
};
let metric = ObserverMetric::RequestLatency(Duration::from_millis(8));
let cloned_event = event.clone();
let cloned_metric = metric.clone();
assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. }));
assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_)));
}
/// Downcast support for backend-specific operations (e.g. Prometheus encoding)
fn as_any(&self) -> &dyn std::any::Any;
}

View file

@ -1129,4 +1129,40 @@ mod tests {
"https://opencode.ai/zen/v1/chat/completions"
);
}
// ══════════════════════════════════════════════════════════
// Issue #580: Custom provider URL construction tests
// ══════════════════════════════════════════════════════════
#[test]
fn chat_completions_url_custom_provider_opencode_issue_580() {
// Issue #580: Custom provider should correctly append /chat/completions
// The error log format "{provider_name}/{current_model}" was confusing
// but the actual URL construction was always correct.
let p = make_provider("custom", "https://opencode.ai/zen/v1", None);
assert_eq!(
p.chat_completions_url(),
"https://opencode.ai/zen/v1/chat/completions"
);
}
#[test]
fn chat_completions_url_custom_provider_standard() {
// Standard custom provider without /v1 path
let p = make_provider("custom", "https://my-api.example.com", None);
assert_eq!(
p.chat_completions_url(),
"https://my-api.example.com/chat/completions"
);
}
#[test]
fn chat_completions_url_custom_provider_with_v1() {
// Custom provider with /v1 path
let p = make_provider("custom", "https://my-api.example.com/v1", None);
assert_eq!(
p.chat_completions_url(),
"https://my-api.example.com/v1/chat/completions"
);
}
}

View file

@ -193,8 +193,18 @@ impl Provider for ReliableProvider {
} else {
"retryable"
};
// For custom providers, strip the URL from the provider name
// to avoid confusion. The format "custom:https://..." in error
// logs makes it look like the model is being appended to the URL.
let display_provider = if provider_name.starts_with("custom:") {
"custom"
} else if provider_name.starts_with("anthropic-custom:") {
"anthropic-custom"
} else {
provider_name
};
failures.push(format!(
"provider={provider_name} model={current_model} attempt {}/{}: {failure_reason}",
"{display_provider}/{current_model} attempt {}/{}: {failure_reason}",
attempt + 1,
self.max_retries + 1
));
@ -298,8 +308,18 @@ impl Provider for ReliableProvider {
} else {
"retryable"
};
// For custom providers, strip the URL from the provider name
// to avoid confusion. The format "custom:https://..." in error
// logs makes it look like the model is being appended to the URL.
let display_provider = if provider_name.starts_with("custom:") {
"custom"
} else if provider_name.starts_with("anthropic-custom:") {
"anthropic-custom"
} else {
provider_name
};
failures.push(format!(
"provider={provider_name} model={current_model} attempt {}/{}: {failure_reason}",
"{display_provider}/{current_model} attempt {}/{}: {failure_reason}",
attempt + 1,
self.max_retries + 1
));