feat(observability): focus PR 596 on Prometheus backend

This commit is contained in:
Chummy 2026-02-18 11:32:51 +08:00
parent eba544dbd4
commit 2560399423
12 changed files with 358 additions and 103 deletions

View file

@ -22,9 +22,10 @@ impl Observer for LogObserver {
model,
duration,
tokens_used,
cost_usd,
} => {
let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
info!(provider = %provider, model = %model, duration_ms = ms, tokens = ?tokens_used, "agent.end");
info!(provider = %provider, model = %model, duration_ms = ms, tokens = ?tokens_used, cost_usd = ?cost_usd, "agent.end");
}
ObserverEvent::ToolCallStart { tool } => {
info!(tool = %tool, "tool.start");
@ -130,12 +131,14 @@ mod tests {
model: "claude-sonnet".into(),
duration: Duration::from_millis(500),
tokens_used: Some(100),
cost_usd: Some(0.0015),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),

View file

@ -1,13 +1,21 @@
pub mod log;
pub mod multi;
pub mod noop;
pub mod otel;
pub mod prometheus;
pub mod traits;
pub mod verbose;
#[allow(unused_imports)]
pub use self::log::LogObserver;
#[allow(unused_imports)]
pub use self::multi::MultiObserver;
pub use noop::NoopObserver;
pub use otel::OtelObserver;
pub use prometheus::PrometheusObserver;
pub use traits::{Observer, ObserverEvent};
#[allow(unused_imports)]
pub use verbose::VerboseObserver;
use crate::config::ObservabilityConfig;
@ -16,6 +24,27 @@ pub fn create_observer(config: &ObservabilityConfig) -> Box<dyn Observer> {
match config.backend.as_str() {
"log" => Box::new(LogObserver::new()),
"prometheus" => Box::new(PrometheusObserver::new()),
"otel" | "opentelemetry" | "otlp" => {
match OtelObserver::new(
config.otel_endpoint.as_deref(),
config.otel_service_name.as_deref(),
) {
Ok(obs) => {
tracing::info!(
endpoint = config
.otel_endpoint
.as_deref()
.unwrap_or("http://localhost:4318"),
"OpenTelemetry observer initialized"
);
Box::new(obs)
}
Err(e) => {
tracing::error!("Failed to create OTel observer: {e}. Falling back to noop.");
Box::new(NoopObserver)
}
}
}
"none" | "noop" => Box::new(NoopObserver),
_ => {
tracing::warn!(
@ -35,7 +64,7 @@ mod tests {
fn factory_none_returns_noop() {
let cfg = ObservabilityConfig {
backend: "none".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -44,7 +73,7 @@ mod tests {
fn factory_noop_returns_noop() {
let cfg = ObservabilityConfig {
backend: "noop".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -53,7 +82,7 @@ mod tests {
fn factory_log_returns_log() {
let cfg = ObservabilityConfig {
backend: "log".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "log");
}
@ -62,16 +91,46 @@ mod tests {
fn factory_prometheus_returns_prometheus() {
let cfg = ObservabilityConfig {
backend: "prometheus".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "prometheus");
}
#[test]
fn factory_otel_returns_otel() {
let cfg = ObservabilityConfig {
backend: "otel".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
};
assert_eq!(create_observer(&cfg).name(), "otel");
}
#[test]
fn factory_opentelemetry_alias() {
let cfg = ObservabilityConfig {
backend: "opentelemetry".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
};
assert_eq!(create_observer(&cfg).name(), "otel");
}
#[test]
fn factory_otlp_alias() {
let cfg = ObservabilityConfig {
backend: "otlp".into(),
otel_endpoint: Some("http://127.0.0.1:19999".into()),
otel_service_name: Some("test".into()),
};
assert_eq!(create_observer(&cfg).name(), "otel");
}
#[test]
fn factory_unknown_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: "xyzzy_unknown".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -80,7 +139,7 @@ mod tests {
fn factory_empty_string_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: String::new(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}
@ -89,7 +148,7 @@ mod tests {
fn factory_garbage_falls_back_to_noop() {
let cfg = ObservabilityConfig {
backend: "xyzzy_garbage_123".into(),
..Default::default()
..ObservabilityConfig::default()
};
assert_eq!(create_observer(&cfg).name(), "noop");
}

View file

@ -43,12 +43,14 @@ mod tests {
model: "test".into(),
duration: Duration::from_millis(100),
tokens_used: Some(42),
cost_usd: Some(0.001),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "test".into(),
model: "test".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),

View file

@ -5,6 +5,7 @@ use opentelemetry::{global, KeyValue};
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::trace::SdkTracerProvider;
use std::any::Any;
use std::time::SystemTime;
/// OpenTelemetry-backed observer — exports traces and metrics via OTLP.
@ -225,6 +226,8 @@ impl Observer for OtelObserver {
span.end();
}
ObserverEvent::AgentEnd {
provider,
model,
duration,
tokens_used,
cost_usd,
@ -239,7 +242,11 @@ impl Observer for OtelObserver {
opentelemetry::trace::SpanBuilder::from_name("agent.invocation")
.with_kind(SpanKind::Internal)
.with_start_time(start_time)
.with_attributes(vec![KeyValue::new("duration_s", secs)]),
.with_attributes(vec![
KeyValue::new("provider", provider.clone()),
KeyValue::new("model", model.clone()),
KeyValue::new("duration_s", secs),
]),
);
if let Some(t) = tokens_used {
span.set_attribute(KeyValue::new("tokens_used", *t as i64));
@ -249,7 +256,13 @@ impl Observer for OtelObserver {
}
span.end();
self.agent_duration.record(secs, &[]);
self.agent_duration.record(
secs,
&[
KeyValue::new("provider", provider.clone()),
KeyValue::new("model", model.clone()),
],
);
// Note: tokens are recorded via record_metric(TokensUsed) to avoid
// double-counting. AgentEnd only records duration.
}
@ -350,6 +363,10 @@ impl Observer for OtelObserver {
fn name(&self) -> &str {
"otel"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]
@ -396,11 +413,15 @@ mod tests {
error_message: None,
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::from_millis(500),
tokens_used: Some(100),
cost_usd: Some(0.0015),
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,

View file

@ -47,11 +47,9 @@ impl PrometheusObserver {
)
.expect("valid metric");
let heartbeat_ticks = prometheus::IntCounter::new(
"zeroclaw_heartbeat_ticks_total",
"Total heartbeat ticks",
)
.expect("valid metric");
let heartbeat_ticks =
prometheus::IntCounter::new("zeroclaw_heartbeat_ticks_total", "Total heartbeat ticks")
.expect("valid metric");
let errors = IntCounterVec::new(
prometheus::Opts::new("zeroclaw_errors_total", "Total errors by component"),
@ -158,6 +156,7 @@ impl Observer for PrometheusObserver {
model,
duration,
tokens_used,
cost_usd: _,
} => {
// Agent duration is recorded via the histogram with provider/model labels
self.agent_duration
@ -167,11 +166,7 @@ impl Observer for PrometheusObserver {
self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
}
}
ObserverEvent::ToolCallStart { tool } => {
self.tool_calls
.with_label_values(&[&tool.to_string(), &"start".to_string()])
.inc();
}
ObserverEvent::ToolCallStart { tool: _ } => {}
ObserverEvent::ToolCall {
tool,
duration,
@ -179,10 +174,10 @@ impl Observer for PrometheusObserver {
} => {
let success_str = if *success { "true" } else { "false" };
self.tool_calls
.with_label_values(&[&tool.to_string(), &success_str.to_string()])
.with_label_values(&[tool.as_str(), success_str])
.inc();
self.tool_duration
.with_label_values(&[&tool.to_string()])
.with_label_values(&[tool.as_str()])
.observe(duration.as_secs_f64());
}
ObserverEvent::TurnComplete => {
@ -221,7 +216,9 @@ impl Observer for PrometheusObserver {
.set(*s as f64);
}
ObserverMetric::QueueDepth(d) => {
self.queue_depth.with_label_values(&[] as &[&str]).set(*d as f64);
self.queue_depth
.with_label_values(&[] as &[&str])
.set(*d as f64);
}
}
}
@ -257,12 +254,14 @@ mod tests {
model: "claude-sonnet".into(),
duration: Duration::from_millis(500),
tokens_used: Some(100),
cost_usd: None,
});
obs.record_event(&ObserverEvent::AgentEnd {
provider: "openrouter".into(),
model: "claude-sonnet".into(),
duration: Duration::ZERO,
tokens_used: None,
cost_usd: None,
});
obs.record_event(&ObserverEvent::ToolCall {
tool: "shell".into(),

View file

@ -7,12 +7,31 @@ pub enum ObserverEvent {
provider: String,
model: String,
},
/// A request is about to be sent to an LLM provider.
///
/// This is emitted immediately before a provider call so observers can print
/// user-facing progress without leaking prompt contents.
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
/// Result of a single LLM provider call.
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
AgentEnd {
provider: String,
model: String,
duration: Duration,
tokens_used: Option<u64>,
cost_usd: Option<f64>,
},
/// A tool call is about to be executed.
ToolCallStart {
tool: String,
},
@ -21,6 +40,7 @@ pub enum ObserverEvent {
duration: Duration,
success: bool,
},
/// The agent produced a final answer for the current user message.
TurnComplete,
ChannelMessage {
channel: String,
@ -31,19 +51,6 @@ pub enum ObserverEvent {
component: String,
message: String,
},
// LLM request/response tracking
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
}
/// Numeric metrics
@ -56,7 +63,7 @@ pub enum ObserverMetric {
}
/// Core observability trait — implement for any backend
pub trait Observer: Send + Sync {
pub trait Observer: Send + Sync + 'static {
/// Record a discrete event
fn record_event(&self, event: &ObserverEvent);
@ -69,6 +76,79 @@ pub trait Observer: Send + Sync {
/// Human-readable name of this observer
fn name(&self) -> &str;
/// Downcast support for backend-specific operations (e.g. Prometheus encoding)
/// Downcast to `Any` for backend-specific operations
fn as_any(&self) -> &dyn std::any::Any;
}
#[cfg(test)]
mod tests {
use super::*;
use parking_lot::Mutex;
use std::time::Duration;
#[derive(Default)]
struct DummyObserver {
events: Mutex<u64>,
metrics: Mutex<u64>,
}
impl Observer for DummyObserver {
fn record_event(&self, _event: &ObserverEvent) {
let mut guard = self.events.lock();
*guard += 1;
}
fn record_metric(&self, _metric: &ObserverMetric) {
let mut guard = self.metrics.lock();
*guard += 1;
}
fn name(&self) -> &str {
"dummy-observer"
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
#[test]
fn observer_records_events_and_metrics() {
let observer = DummyObserver::default();
observer.record_event(&ObserverEvent::HeartbeatTick);
observer.record_event(&ObserverEvent::Error {
component: "test".into(),
message: "boom".into(),
});
observer.record_metric(&ObserverMetric::TokensUsed(42));
assert_eq!(*observer.events.lock(), 2);
assert_eq!(*observer.metrics.lock(), 1);
}
#[test]
fn observer_default_flush_and_as_any_work() {
let observer = DummyObserver::default();
observer.flush();
assert_eq!(observer.name(), "dummy-observer");
assert!(observer.as_any().downcast_ref::<DummyObserver>().is_some());
}
#[test]
fn observer_event_and_metric_are_cloneable() {
let event = ObserverEvent::ToolCall {
tool: "shell".into(),
duration: Duration::from_millis(10),
success: true,
};
let metric = ObserverMetric::RequestLatency(Duration::from_millis(8));
let cloned_event = event.clone();
let cloned_metric = metric.clone();
assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. }));
assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_)));
}
}

View file

@ -1,4 +1,5 @@
use super::traits::{Observer, ObserverEvent, ObserverMetric};
use std::any::Any;
/// Human-readable progress observer for interactive CLI sessions.
///
@ -56,6 +57,10 @@ impl Observer for VerboseObserver {
fn name(&self) -> &str {
"verbose"
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]