feat(observability): implement Prometheus metrics backend with /metrics endpoint

- Adds PrometheusObserver backend with counters, histograms, and gauges
- Tracks agent starts/duration, tool calls, channel messages, heartbeat ticks, errors, request latency, tokens, sessions, queue depth
- Adds GET /metrics endpoint to gateway for Prometheus scraping
- Adds provider/model labels to AgentStart and AgentEnd events for better observability
- Adds as_any() method to Observer trait for backend-specific downcast

Metrics exposed:
- zeroclaw_agent_starts_total (Counter) with provider/model labels
- zeroclaw_agent_duration_seconds (Histogram) with provider/model labels
- zeroclaw_tool_calls_total (Counter) with tool/success labels
- zeroclaw_tool_duration_seconds (Histogram) with tool label
- zeroclaw_channel_messages_total (Counter) with channel/direction labels
- zeroclaw_heartbeat_ticks_total (Counter)
- zeroclaw_errors_total (Counter) with component label
- zeroclaw_request_latency_seconds (Histogram)
- zeroclaw_tokens_used_last (Gauge)
- zeroclaw_active_sessions (Gauge)
- zeroclaw_queue_depth (Gauge)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
argenis de la rosa 2026-02-17 14:01:37 -05:00 committed by Chummy
parent c04f2855e4
commit eba544dbd4
11 changed files with 575 additions and 228 deletions

View file

@ -274,6 +274,8 @@ pub struct AppState {
pub whatsapp: Option<Arc<WhatsAppChannel>>,
/// `WhatsApp` app secret for webhook signature verification (`X-Hub-Signature-256`)
pub whatsapp_app_secret: Option<Arc<str>>,
/// Observability backend for metrics scraping
pub observer: Arc<dyn crate::observability::Observer>,
}
/// Run the HTTP gateway using axum with proper HTTP/1.1 compliance.
@ -433,6 +435,7 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
println!(" POST /whatsapp — WhatsApp message webhook");
}
println!(" GET /health — health check");
println!(" GET /metrics — Prometheus metrics");
if let Some(code) = pairing.pairing_code() {
println!();
println!(" 🔐 PAIRING REQUIRED — use this one-time code:");
@ -450,6 +453,9 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
crate::health::mark_component_ok("gateway");
// Build shared state
let observer: Arc<dyn crate::observability::Observer> =
Arc::from(crate::observability::create_observer(&config.observability));
let state = AppState {
config: config_state,
provider,
@ -464,11 +470,13 @@ pub async fn run_gateway(host: &str, port: u16, config: Config) -> Result<()> {
idempotency_store,
whatsapp: whatsapp_channel,
whatsapp_app_secret,
observer,
};
// Build router with middleware
let app = Router::new()
.route("/health", get(handle_health))
.route("/metrics", get(handle_metrics))
.route("/pair", post(handle_pair))
.route("/webhook", post(handle_webhook))
.route("/whatsapp", get(handle_whatsapp_verify))
@ -504,6 +512,29 @@ async fn handle_health(State(state): State<AppState>) -> impl IntoResponse {
Json(body)
}
/// Prometheus content type for text exposition format.
///
/// Sent as the `Content-Type` header of the `GET /metrics` response; the
/// `version=0.0.4` parameter is part of the literal value and is passed
/// through unchanged.
const PROMETHEUS_CONTENT_TYPE: &str = "text/plain; version=0.0.4; charset=utf-8";
/// GET /metrics — scrape endpoint for Prometheus.
///
/// The shared observer held in [`AppState`] is downcast (through
/// `Observer::as_any`) to `PrometheusObserver`:
///
/// - on success, the response body is whatever `PrometheusObserver::encode`
///   returns;
/// - otherwise, the body is a single `#`-prefixed line telling the operator
///   how to enable the Prometheus backend.
///
/// In both cases the response status is `200 OK` and the `Content-Type`
/// header is [`PROMETHEUS_CONTENT_TYPE`].
async fn handle_metrics(State(state): State<AppState>) -> impl IntoResponse {
    // Borrow the trait object out of the `Arc` so `as_any` is dispatched on
    // the observer itself rather than on the smart pointer.
    let observer = &*state.observer;

    let payload = match observer
        .as_any()
        .downcast_ref::<crate::observability::PrometheusObserver>()
    {
        Some(prometheus) => prometheus.encode(),
        None => String::from("# Prometheus backend not enabled. Set [observability] backend = \"prometheus\" in config.\n"),
    };

    let response_headers = [(header::CONTENT_TYPE, PROMETHEUS_CONTENT_TYPE)];
    (StatusCode::OK, response_headers, payload)
}
/// POST /pair — exchange one-time code for bearer token
async fn handle_pair(
State(state): State<AppState>,
@ -1247,6 +1278,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();
@ -1302,6 +1334,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let headers = HeaderMap::new();
@ -1366,6 +1399,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let response = handle_webhook(
@ -1403,6 +1437,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();
@ -1443,6 +1478,7 @@ mod tests {
idempotency_store: Arc::new(IdempotencyStore::new(Duration::from_secs(300), 1000)),
whatsapp: None,
whatsapp_app_secret: None,
observer: Arc::new(crate::observability::NoopObserver),
};
let mut headers = HeaderMap::new();