feat(observability): implement Prometheus metrics backend with /metrics endpoint

- Adds PrometheusObserver backend with counters, histograms, and gauges
- Tracks agent starts/duration, tool calls, channel messages, heartbeat ticks, errors, request latency, tokens, sessions, queue depth
- Adds GET /metrics endpoint to gateway for Prometheus scraping
- Adds provider/model labels to AgentStart and AgentEnd events for better observability
- Adds an as_any() method to the Observer trait so callers can downcast to a specific backend

Metrics exposed:
- zeroclaw_agent_starts_total (Counter) with provider/model labels
- zeroclaw_agent_duration_seconds (Histogram) with provider/model labels
- zeroclaw_tool_calls_total (Counter) with tool/success labels
- zeroclaw_tool_duration_seconds (Histogram) with tool label
- zeroclaw_channel_messages_total (Counter) with channel/direction labels
- zeroclaw_heartbeat_ticks_total (Counter)
- zeroclaw_errors_total (Counter) with component label
- zeroclaw_request_latency_seconds (Histogram)
- zeroclaw_tokens_used_last (Gauge)
- zeroclaw_active_sessions (Gauge)
- zeroclaw_queue_depth (Gauge)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
argenis de la rosa 2026-02-17 14:01:37 -05:00 committed by Chummy
parent c04f2855e4
commit eba544dbd4
11 changed files with 575 additions and 228 deletions

View file

@ -7,29 +7,12 @@ pub enum ObserverEvent {
provider: String,
model: String,
},
/// A request is about to be sent to an LLM provider.
///
/// This is emitted immediately before a provider call so observers can print
/// user-facing progress without leaking prompt contents.
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
/// Result of a single LLM provider call.
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
AgentEnd {
provider: String,
model: String,
duration: Duration,
tokens_used: Option<u64>,
cost_usd: Option<f64>,
},
/// A tool call is about to be executed.
ToolCallStart {
tool: String,
},
@ -38,7 +21,6 @@ pub enum ObserverEvent {
duration: Duration,
success: bool,
},
/// The agent produced a final answer for the current user message.
TurnComplete,
ChannelMessage {
channel: String,
@ -49,6 +31,19 @@ pub enum ObserverEvent {
component: String,
message: String,
},
// LLM request/response tracking
LlmRequest {
provider: String,
model: String,
messages_count: usize,
},
LlmResponse {
provider: String,
model: String,
duration: Duration,
success: bool,
error_message: Option<String>,
},
}
/// Numeric metrics
@ -61,7 +56,7 @@ pub enum ObserverMetric {
}
/// Core observability trait — implement for any backend
pub trait Observer: Send + Sync + 'static {
pub trait Observer: Send + Sync {
/// Record a discrete event
fn record_event(&self, event: &ObserverEvent);
@ -74,80 +69,6 @@ pub trait Observer: Send + Sync + 'static {
/// Human-readable name of this observer
fn name(&self) -> &str;
/// Downcast to `Any` for backend-specific operations
///
/// Returns `self` as a `&dyn Any`, which callers can then narrow to the
/// concrete observer type with `downcast_ref`.
///
/// Because of the `Self: Sized` bound, this default method can only be
/// called on a concrete observer type; it is not available through a
/// `dyn Observer` trait object.
///
/// NOTE(review): coercing `&Self` to `&dyn Any` needs `Self: 'static`;
/// presumably that comes from the trait's supertrait bounds — confirm.
fn as_any(&self) -> &dyn std::any::Any
where
Self: Sized,
{
self
}
}
#[cfg(test)]
mod tests {
use super::*;
use parking_lot::Mutex;
use std::time::Duration;
/// Test double that counts calls instead of exporting anything.
///
/// Both counters start at zero via the derived `Default`.
#[derive(Default)]
struct DummyObserver {
/// Counter of recorded events, guarded by a mutex so it can be bumped through `&self`.
events: Mutex<u64>,
/// Counter of recorded metrics, guarded by a mutex so it can be bumped through `&self`.
metrics: Mutex<u64>,
}
/// Counting `Observer`: each recorded event or metric bumps its own counter
/// and the payload itself is ignored.
impl Observer for DummyObserver {
    /// Adds one to the event counter.
    fn record_event(&self, _event: &ObserverEvent) {
        // The temporary guard is released at the end of the statement.
        *self.events.lock() += 1;
    }

    /// Adds one to the metric counter.
    fn record_metric(&self, _metric: &ObserverMetric) {
        *self.metrics.lock() += 1;
    }

    /// Fixed name reported by this observer.
    fn name(&self) -> &str {
        "dummy-observer"
    }
}
/// Recording two events and one metric must be reflected in the matching counters.
#[test]
fn observer_records_events_and_metrics() {
    let dummy = DummyObserver::default();

    // Two events: a heartbeat tick followed by an error.
    let events = [
        ObserverEvent::HeartbeatTick,
        ObserverEvent::Error {
            component: "test".into(),
            message: "boom".into(),
        },
    ];
    for event in &events {
        dummy.record_event(event);
    }

    // One metric.
    dummy.record_metric(&ObserverMetric::TokensUsed(42));

    let event_count = *dummy.events.lock();
    let metric_count = *dummy.metrics.lock();
    assert_eq!(event_count, 2);
    assert_eq!(metric_count, 1);
}
/// `flush` is callable, `name` reports the fixed identifier, and `as_any`
/// downcasts back to the concrete observer type.
#[test]
fn observer_default_flush_and_as_any_work() {
    let dummy = DummyObserver::default();

    dummy.flush();
    assert_eq!(dummy.name(), "dummy-observer");

    let downcast = dummy.as_any().downcast_ref::<DummyObserver>();
    assert!(downcast.is_some());
}
/// Cloning an event or a metric must yield a value of the same variant.
#[test]
fn observer_event_and_metric_are_cloneable() {
    let original_event = ObserverEvent::ToolCall {
        tool: "shell".into(),
        duration: Duration::from_millis(10),
        success: true,
    };
    let original_metric = ObserverMetric::RequestLatency(Duration::from_millis(8));

    assert!(matches!(
        original_event.clone(),
        ObserverEvent::ToolCall { .. }
    ));
    assert!(matches!(
        original_metric.clone(),
        ObserverMetric::RequestLatency(_)
    ));
}
/// Downcast support for backend-specific operations (e.g. Prometheus encoding)
///
/// Returns this observer as a `&dyn Any`, which callers can narrow to a
/// concrete backend type with `downcast_ref`. There is no default body, so
/// every implementor has to provide this method.
fn as_any(&self) -> &dyn std::any::Any;
}