Skip to main content

zeroclaw_runtime/observability/
prometheus.rs

1use super::traits::{Observer, ObserverEvent, ObserverMetric};
2use prometheus::{
3    Encoder, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounterVec, Registry, TextEncoder,
4};
5use std::sync::{Arc, OnceLock};
6
/// Prometheus-backed observer — exposes metrics for scraping via `/metrics`.
///
/// Holds one private [`Registry`] together with a handle to every collector that
/// `new` registers in it. `record_event` / `record_metric` update the collectors and
/// `encode` renders the registry in the Prometheus text format.
pub struct PrometheusObserver {
    /// Registry that `new` registers all collectors into and `encode` gathers from.
    registry: Registry,

    // Counters
    /// `zeroclaw_agent_starts_total`, labelled `model_provider`, `model`.
    agent_starts: IntCounterVec,
    /// `zeroclaw_llm_requests_total`, labelled `model_provider`, `model`, `success`.
    llm_requests: IntCounterVec,
    /// `zeroclaw_tokens_input_total`, labelled `model_provider`, `model`.
    tokens_input_total: IntCounterVec,
    /// `zeroclaw_tokens_output_total`, labelled `model_provider`, `model`.
    tokens_output_total: IntCounterVec,
    /// `zeroclaw_tool_calls_total`, labelled `tool`, `success`.
    tool_calls: IntCounterVec,
    /// `zeroclaw_channel_messages_total`, labelled `channel`, `direction`.
    channel_messages: IntCounterVec,
    /// `zeroclaw_heartbeat_ticks_total` (no labels).
    heartbeat_ticks: prometheus::IntCounter,
    /// `zeroclaw_errors_total`, labelled `component`.
    errors: IntCounterVec,
    /// `zeroclaw_cache_hits_total`, labelled `cache_type`.
    cache_hits: IntCounterVec,
    /// `zeroclaw_cache_misses_total`, labelled `cache_type`.
    cache_misses: IntCounterVec,
    /// `zeroclaw_cache_tokens_saved_total`, labelled `cache_type`.
    cache_tokens_saved: IntCounterVec,

    // Histograms
    /// `zeroclaw_agent_duration_seconds`, labelled `model_provider`, `model`.
    agent_duration: HistogramVec,
    /// `zeroclaw_tool_duration_seconds`, labelled `tool`.
    tool_duration: HistogramVec,
    /// `zeroclaw_request_latency_seconds` (no labels).
    request_latency: Histogram,

    // Gauges
    /// `zeroclaw_tokens_used_last` — overwritten with the most recent token count.
    tokens_used: prometheus::IntGauge,
    /// `zeroclaw_active_sessions` — a gauge vec declared with an empty label set.
    active_sessions: GaugeVec,
    /// `zeroclaw_queue_depth` — a gauge vec declared with an empty label set.
    queue_depth: GaugeVec,

    // DORA
    /// `zeroclaw_deployments_total`, labelled `status` (`"success"` / `"failure"`).
    deployments_total: IntCounterVec,
    /// `zeroclaw_deployment_lead_time_seconds` histogram.
    deployment_lead_time: Histogram,
    /// `zeroclaw_deployment_failure_rate` — failures divided by total deployments.
    deployment_failure_rate: prometheus::Gauge,
    /// `zeroclaw_recovery_time_seconds` histogram.
    recovery_time: Histogram,
    /// `zeroclaw_mttr_seconds` gauge.
    mttr: prometheus::Gauge,
    /// Running tally of completed deployments, used to compute the failure rate.
    deploy_success_count: std::sync::atomic::AtomicU64,
    /// Running tally of failed deployments, used to compute the failure rate.
    deploy_failure_count: std::sync::atomic::AtomicU64,
}
43
44impl Default for PrometheusObserver {
45    fn default() -> Self {
46        Self::new()
47    }
48}
49
50impl PrometheusObserver {
51    pub fn new() -> Self {
52        let registry = Registry::new();
53
54        let agent_starts = IntCounterVec::new(
55            prometheus::Opts::new("zeroclaw_agent_starts_total", "Total agent invocations"),
56            &["model_provider", "model"],
57        )
58        .expect("valid metric");
59
60        let llm_requests = IntCounterVec::new(
61            prometheus::Opts::new(
62                "zeroclaw_llm_requests_total",
63                "Total LLM model_provider requests",
64            ),
65            &["model_provider", "model", "success"],
66        )
67        .expect("valid metric");
68
69        let tokens_input_total = IntCounterVec::new(
70            prometheus::Opts::new("zeroclaw_tokens_input_total", "Total input tokens consumed"),
71            &["model_provider", "model"],
72        )
73        .expect("valid metric");
74
75        let tokens_output_total = IntCounterVec::new(
76            prometheus::Opts::new(
77                "zeroclaw_tokens_output_total",
78                "Total output tokens consumed",
79            ),
80            &["model_provider", "model"],
81        )
82        .expect("valid metric");
83
84        let tool_calls = IntCounterVec::new(
85            prometheus::Opts::new("zeroclaw_tool_calls_total", "Total tool calls"),
86            &["tool", "success"],
87        )
88        .expect("valid metric");
89
90        let channel_messages = IntCounterVec::new(
91            prometheus::Opts::new("zeroclaw_channel_messages_total", "Total channel messages"),
92            &["channel", "direction"],
93        )
94        .expect("valid metric");
95
96        let heartbeat_ticks =
97            prometheus::IntCounter::new("zeroclaw_heartbeat_ticks_total", "Total heartbeat ticks")
98                .expect("valid metric");
99
100        let errors = IntCounterVec::new(
101            prometheus::Opts::new("zeroclaw_errors_total", "Total errors by component"),
102            &["component"],
103        )
104        .expect("valid metric");
105
106        let cache_hits = IntCounterVec::new(
107            prometheus::Opts::new("zeroclaw_cache_hits_total", "Total response cache hits"),
108            &["cache_type"],
109        )
110        .expect("valid metric");
111
112        let cache_misses = IntCounterVec::new(
113            prometheus::Opts::new("zeroclaw_cache_misses_total", "Total response cache misses"),
114            &["cache_type"],
115        )
116        .expect("valid metric");
117
118        let cache_tokens_saved = IntCounterVec::new(
119            prometheus::Opts::new(
120                "zeroclaw_cache_tokens_saved_total",
121                "Total tokens saved by response cache",
122            ),
123            &["cache_type"],
124        )
125        .expect("valid metric");
126
127        let agent_duration = HistogramVec::new(
128            HistogramOpts::new(
129                "zeroclaw_agent_duration_seconds",
130                "Agent invocation duration in seconds",
131            )
132            .buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]),
133            &["model_provider", "model"],
134        )
135        .expect("valid metric");
136
137        let tool_duration = HistogramVec::new(
138            HistogramOpts::new(
139                "zeroclaw_tool_duration_seconds",
140                "Tool execution duration in seconds",
141            )
142            .buckets(vec![0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]),
143            &["tool"],
144        )
145        .expect("valid metric");
146
147        let request_latency = Histogram::with_opts(
148            HistogramOpts::new(
149                "zeroclaw_request_latency_seconds",
150                "Request latency in seconds",
151            )
152            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
153        )
154        .expect("valid metric");
155
156        let tokens_used = prometheus::IntGauge::new(
157            "zeroclaw_tokens_used_last",
158            "Tokens used in the last request",
159        )
160        .expect("valid metric");
161
162        let active_sessions = GaugeVec::new(
163            prometheus::Opts::new("zeroclaw_active_sessions", "Number of active sessions"),
164            &[],
165        )
166        .expect("valid metric");
167
168        let queue_depth = GaugeVec::new(
169            prometheus::Opts::new("zeroclaw_queue_depth", "Message queue depth"),
170            &[],
171        )
172        .expect("valid metric");
173
174        let deployments_total = IntCounterVec::new(
175            prometheus::Opts::new("zeroclaw_deployments_total", "Total deployments by status"),
176            &["status"],
177        )
178        .expect("valid metric");
179
180        let deployment_lead_time = Histogram::with_opts(
181            HistogramOpts::new(
182                "zeroclaw_deployment_lead_time_seconds",
183                "Deployment lead time from commit to deploy in seconds",
184            )
185            .buckets(vec![
186                60.0, 300.0, 600.0, 1800.0, 3600.0, 7200.0, 14400.0, 43200.0, 86400.0,
187            ]),
188        )
189        .expect("valid metric");
190
191        let deployment_failure_rate = prometheus::Gauge::new(
192            "zeroclaw_deployment_failure_rate",
193            "Ratio of failed deployments to total deployments",
194        )
195        .expect("valid metric");
196
197        let recovery_time = Histogram::with_opts(
198            HistogramOpts::new(
199                "zeroclaw_recovery_time_seconds",
200                "Time to recover from a failed deployment in seconds",
201            )
202            .buckets(vec![
203                60.0, 300.0, 600.0, 1800.0, 3600.0, 7200.0, 14400.0, 43200.0, 86400.0,
204            ]),
205        )
206        .expect("valid metric");
207
208        let mttr =
209            prometheus::Gauge::new("zeroclaw_mttr_seconds", "Mean time to recovery in seconds")
210                .expect("valid metric");
211
212        // Register all metrics
213        registry.register(Box::new(agent_starts.clone())).ok();
214        registry.register(Box::new(llm_requests.clone())).ok();
215        registry.register(Box::new(tokens_input_total.clone())).ok();
216        registry
217            .register(Box::new(tokens_output_total.clone()))
218            .ok();
219        registry.register(Box::new(tool_calls.clone())).ok();
220        registry.register(Box::new(channel_messages.clone())).ok();
221        registry.register(Box::new(heartbeat_ticks.clone())).ok();
222        registry.register(Box::new(errors.clone())).ok();
223        registry.register(Box::new(cache_hits.clone())).ok();
224        registry.register(Box::new(cache_misses.clone())).ok();
225        registry.register(Box::new(cache_tokens_saved.clone())).ok();
226        registry.register(Box::new(agent_duration.clone())).ok();
227        registry.register(Box::new(tool_duration.clone())).ok();
228        registry.register(Box::new(request_latency.clone())).ok();
229        registry.register(Box::new(tokens_used.clone())).ok();
230        registry.register(Box::new(active_sessions.clone())).ok();
231        registry.register(Box::new(queue_depth.clone())).ok();
232        registry.register(Box::new(deployments_total.clone())).ok();
233        registry
234            .register(Box::new(deployment_lead_time.clone()))
235            .ok();
236        registry
237            .register(Box::new(deployment_failure_rate.clone()))
238            .ok();
239        registry.register(Box::new(recovery_time.clone())).ok();
240        registry.register(Box::new(mttr.clone())).ok();
241
242        Self {
243            registry,
244            agent_starts,
245            llm_requests,
246            tokens_input_total,
247            tokens_output_total,
248            tool_calls,
249            channel_messages,
250            heartbeat_ticks,
251            errors,
252            cache_hits,
253            cache_misses,
254            cache_tokens_saved,
255            agent_duration,
256            tool_duration,
257            request_latency,
258            tokens_used,
259            active_sessions,
260            queue_depth,
261            deployments_total,
262            deployment_lead_time,
263            deployment_failure_rate,
264            recovery_time,
265            mttr,
266            deploy_success_count: std::sync::atomic::AtomicU64::new(0),
267            deploy_failure_count: std::sync::atomic::AtomicU64::new(0),
268        }
269    }
270
271    /// Encode all registered metrics into Prometheus text exposition format.
272    pub fn encode(&self) -> String {
273        let encoder = TextEncoder::new();
274        let families = self.registry.gather();
275        let mut buf = Vec::new();
276        encoder.encode(&families, &mut buf).unwrap_or_default();
277        String::from_utf8(buf).unwrap_or_default()
278    }
279
280    /// Process-wide singleton handle. All call sites that obtain a Prometheus
281    /// observer through this function share the same `Registry` and the same
282    /// underlying counters, so events recorded by the channel orchestrator and
283    /// events recorded by the gateway accumulate into the same time series and
284    /// are visible on a single `/metrics` scrape.
285    ///
286    /// `PrometheusObserver::new()` still returns a fresh, isolated instance —
287    /// kept for tests so parallel test cases don't see each other's counts.
288    pub fn shared() -> Arc<Self> {
289        static SINGLETON: OnceLock<Arc<PrometheusObserver>> = OnceLock::new();
290        SINGLETON.get_or_init(|| Arc::new(Self::new())).clone()
291    }
292}
293
294impl Observer for PrometheusObserver {
295    fn record_event(&self, event: &ObserverEvent) {
296        match event {
297            ObserverEvent::AgentStart {
298                model_provider,
299                model,
300            } => {
301                self.agent_starts
302                    .with_label_values(&[model_provider, model])
303                    .inc();
304            }
305            ObserverEvent::AgentEnd {
306                model_provider,
307                model,
308                duration,
309                tokens_used,
310                cost_usd: _,
311            } => {
312                // Agent duration is recorded via the histogram with model_provider/model labels
313                self.agent_duration
314                    .with_label_values(&[model_provider, model])
315                    .observe(duration.as_secs_f64());
316                if let Some(t) = tokens_used {
317                    self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
318                }
319            }
320            ObserverEvent::LlmResponse {
321                model_provider,
322                model,
323                success,
324                input_tokens,
325                output_tokens,
326                ..
327            } => {
328                let success_str = if *success { "true" } else { "false" };
329                self.llm_requests
330                    .with_label_values(&[model_provider.as_str(), model.as_str(), success_str])
331                    .inc();
332                if let Some(input) = input_tokens {
333                    self.tokens_input_total
334                        .with_label_values(&[model_provider.as_str(), model.as_str()])
335                        .inc_by(*input);
336                }
337                if let Some(output) = output_tokens {
338                    self.tokens_output_total
339                        .with_label_values(&[model_provider.as_str(), model.as_str()])
340                        .inc_by(*output);
341                }
342            }
343            ObserverEvent::ToolCallStart { .. }
344            | ObserverEvent::TurnComplete
345            | ObserverEvent::LlmRequest { .. }
346            | ObserverEvent::DeploymentStarted { .. }
347            | ObserverEvent::RecoveryCompleted { .. } => {}
348            ObserverEvent::ToolCall {
349                tool,
350                duration,
351                success,
352                ..
353            } => {
354                let success_str = if *success { "true" } else { "false" };
355                self.tool_calls
356                    .with_label_values(&[tool.as_str(), success_str])
357                    .inc();
358                self.tool_duration
359                    .with_label_values(&[tool.as_str()])
360                    .observe(duration.as_secs_f64());
361            }
362            ObserverEvent::ChannelMessage { channel, direction } => {
363                self.channel_messages
364                    .with_label_values(&[channel, direction])
365                    .inc();
366            }
367            ObserverEvent::HeartbeatTick => {
368                self.heartbeat_ticks.inc();
369            }
370            ObserverEvent::CacheHit {
371                cache_type,
372                tokens_saved,
373            } => {
374                self.cache_hits.with_label_values(&[cache_type]).inc();
375                self.cache_tokens_saved
376                    .with_label_values(&[cache_type])
377                    .inc_by(*tokens_saved);
378            }
379            ObserverEvent::CacheMiss { cache_type } => {
380                self.cache_misses.with_label_values(&[cache_type]).inc();
381            }
382            ObserverEvent::Error {
383                component,
384                message: _,
385            } => {
386                self.errors.with_label_values(&[component]).inc();
387            }
388            ObserverEvent::DeploymentCompleted { .. } => {
389                self.deployments_total.with_label_values(&["success"]).inc();
390                let s = self
391                    .deploy_success_count
392                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
393                    + 1;
394                let f = self
395                    .deploy_failure_count
396                    .load(std::sync::atomic::Ordering::Relaxed);
397                let total = s + f;
398                if total > 0 {
399                    self.deployment_failure_rate.set(f as f64 / total as f64);
400                }
401            }
402            ObserverEvent::DeploymentFailed { .. } => {
403                self.deployments_total.with_label_values(&["failure"]).inc();
404                let f = self
405                    .deploy_failure_count
406                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
407                    + 1;
408                let s = self
409                    .deploy_success_count
410                    .load(std::sync::atomic::Ordering::Relaxed);
411                let total = s + f;
412                if total > 0 {
413                    self.deployment_failure_rate.set(f as f64 / total as f64);
414                }
415            }
416        }
417    }
418
419    fn record_metric(&self, metric: &ObserverMetric) {
420        match metric {
421            ObserverMetric::RequestLatency(d) => {
422                self.request_latency.observe(d.as_secs_f64());
423            }
424            ObserverMetric::TokensUsed(t) => {
425                self.tokens_used.set(i64::try_from(*t).unwrap_or(i64::MAX));
426            }
427            ObserverMetric::ActiveSessions(s) => {
428                self.active_sessions
429                    .with_label_values(&[] as &[&str])
430                    .set(*s as f64);
431            }
432            ObserverMetric::QueueDepth(d) => {
433                self.queue_depth
434                    .with_label_values(&[] as &[&str])
435                    .set(*d as f64);
436            }
437            ObserverMetric::DeploymentLeadTime(d) => {
438                self.deployment_lead_time.observe(d.as_secs_f64());
439            }
440            ObserverMetric::RecoveryTime(d) => {
441                self.recovery_time.observe(d.as_secs_f64());
442                self.mttr.set(d.as_secs_f64());
443            }
444        }
445    }
446
    /// Returns the fixed identifier of this observer backend: `"prometheus"`.
    fn name(&self) -> &str {
        "prometheus"
    }
450
    /// Exposes `self` as `&dyn Any` so a caller holding a `&dyn Observer` can
    /// downcast back to the concrete `PrometheusObserver`.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
454}
455
456#[cfg(test)]
457mod tests {
458    use super::*;
459    use std::time::Duration;
460
461    #[test]
462    fn prometheus_observer_name() {
463        assert_eq!(PrometheusObserver::new().name(), "prometheus");
464    }
465
466    #[test]
467    fn records_all_events_without_panic() {
468        let obs = PrometheusObserver::new();
469        obs.record_event(&ObserverEvent::AgentStart {
470            model_provider: "openrouter".into(),
471            model: "claude-sonnet".into(),
472        });
473        obs.record_event(&ObserverEvent::AgentEnd {
474            model_provider: "openrouter".into(),
475            model: "claude-sonnet".into(),
476            duration: Duration::from_millis(500),
477            tokens_used: Some(100),
478            cost_usd: None,
479        });
480        obs.record_event(&ObserverEvent::AgentEnd {
481            model_provider: "openrouter".into(),
482            model: "claude-sonnet".into(),
483            duration: Duration::ZERO,
484            tokens_used: None,
485            cost_usd: None,
486        });
487        obs.record_event(&ObserverEvent::ToolCall {
488            tool: "shell".into(),
489            tool_call_id: None,
490            duration: Duration::from_millis(10),
491            success: true,
492            arguments: None,
493            result: None,
494        });
495        obs.record_event(&ObserverEvent::ToolCall {
496            tool: "file_read".into(),
497            tool_call_id: None,
498            duration: Duration::from_millis(5),
499            success: false,
500            arguments: None,
501            result: None,
502        });
503        obs.record_event(&ObserverEvent::ChannelMessage {
504            channel: "telegram".into(),
505            direction: "inbound".into(),
506        });
507        obs.record_event(&ObserverEvent::HeartbeatTick);
508        obs.record_event(&ObserverEvent::Error {
509            component: "model_provider".into(),
510            message: "timeout".into(),
511        });
512    }
513
514    #[test]
515    fn records_all_metrics_without_panic() {
516        let obs = PrometheusObserver::new();
517        obs.record_metric(&ObserverMetric::RequestLatency(Duration::from_secs(2)));
518        obs.record_metric(&ObserverMetric::TokensUsed(500));
519        obs.record_metric(&ObserverMetric::TokensUsed(0));
520        obs.record_metric(&ObserverMetric::ActiveSessions(3));
521        obs.record_metric(&ObserverMetric::QueueDepth(42));
522    }
523
524    #[test]
525    fn encode_produces_prometheus_text_format() {
526        let obs = PrometheusObserver::new();
527        obs.record_event(&ObserverEvent::AgentStart {
528            model_provider: "openrouter".into(),
529            model: "claude-sonnet".into(),
530        });
531        obs.record_event(&ObserverEvent::ToolCall {
532            tool: "shell".into(),
533            tool_call_id: None,
534            duration: Duration::from_millis(100),
535            success: true,
536            arguments: None,
537            result: None,
538        });
539        obs.record_event(&ObserverEvent::HeartbeatTick);
540        obs.record_metric(&ObserverMetric::RequestLatency(Duration::from_millis(250)));
541
542        let output = obs.encode();
543        assert!(output.contains("zeroclaw_agent_starts_total"));
544        assert!(output.contains("zeroclaw_tool_calls_total"));
545        assert!(output.contains("zeroclaw_heartbeat_ticks_total"));
546        assert!(output.contains("zeroclaw_request_latency_seconds"));
547    }
548
549    #[test]
550    fn counters_increment_correctly() {
551        let obs = PrometheusObserver::new();
552
553        for _ in 0..3 {
554            obs.record_event(&ObserverEvent::HeartbeatTick);
555        }
556
557        let output = obs.encode();
558        assert!(output.contains("zeroclaw_heartbeat_ticks_total 3"));
559    }
560
561    #[test]
562    fn tool_calls_track_success_and_failure_separately() {
563        let obs = PrometheusObserver::new();
564
565        obs.record_event(&ObserverEvent::ToolCall {
566            tool: "shell".into(),
567            tool_call_id: None,
568            duration: Duration::from_millis(10),
569            success: true,
570            arguments: None,
571            result: None,
572        });
573        obs.record_event(&ObserverEvent::ToolCall {
574            tool: "shell".into(),
575            tool_call_id: None,
576            duration: Duration::from_millis(10),
577            success: true,
578            arguments: None,
579            result: None,
580        });
581        obs.record_event(&ObserverEvent::ToolCall {
582            tool: "shell".into(),
583            tool_call_id: None,
584            duration: Duration::from_millis(10),
585            success: false,
586            arguments: None,
587            result: None,
588        });
589
590        let output = obs.encode();
591        assert!(output.contains(r#"zeroclaw_tool_calls_total{success="true",tool="shell"} 2"#));
592        assert!(output.contains(r#"zeroclaw_tool_calls_total{success="false",tool="shell"} 1"#));
593    }
594
595    #[test]
596    fn errors_track_by_component() {
597        let obs = PrometheusObserver::new();
598        obs.record_event(&ObserverEvent::Error {
599            component: "model_provider".into(),
600            message: "timeout".into(),
601        });
602        obs.record_event(&ObserverEvent::Error {
603            component: "model_provider".into(),
604            message: "rate limit".into(),
605        });
606        obs.record_event(&ObserverEvent::Error {
607            component: "channels".into(),
608            message: "disconnected".into(),
609        });
610
611        let output = obs.encode();
612        assert!(output.contains(r#"zeroclaw_errors_total{component="model_provider"} 2"#));
613        assert!(output.contains(r#"zeroclaw_errors_total{component="channels"} 1"#));
614    }
615
616    #[test]
617    fn gauge_reflects_latest_value() {
618        let obs = PrometheusObserver::new();
619        obs.record_metric(&ObserverMetric::TokensUsed(100));
620        obs.record_metric(&ObserverMetric::TokensUsed(200));
621
622        let output = obs.encode();
623        assert!(output.contains("zeroclaw_tokens_used_last 200"));
624    }
625
626    #[test]
627    fn llm_response_tracks_request_count_and_tokens() {
628        let obs = PrometheusObserver::new();
629
630        obs.record_event(&ObserverEvent::LlmResponse {
631            model_provider: "openrouter".into(),
632            model: "claude-sonnet".into(),
633            duration: Duration::from_millis(200),
634            success: true,
635            error_message: None,
636            input_tokens: Some(100),
637            output_tokens: Some(50),
638        });
639        obs.record_event(&ObserverEvent::LlmResponse {
640            model_provider: "openrouter".into(),
641            model: "claude-sonnet".into(),
642            duration: Duration::from_millis(300),
643            success: true,
644            error_message: None,
645            input_tokens: Some(200),
646            output_tokens: Some(80),
647        });
648
649        let output = obs.encode();
650        assert!(output.contains(
651            r#"zeroclaw_llm_requests_total{model="claude-sonnet",model_provider="openrouter",success="true"} 2"#
652        ));
653        assert!(output.contains(
654            r#"zeroclaw_tokens_input_total{model="claude-sonnet",model_provider="openrouter"} 300"#
655        ));
656        assert!(output.contains(
657            r#"zeroclaw_tokens_output_total{model="claude-sonnet",model_provider="openrouter"} 130"#
658        ));
659    }
660
661    #[test]
662    fn llm_response_without_tokens_increments_request_only() {
663        let obs = PrometheusObserver::new();
664
665        obs.record_event(&ObserverEvent::LlmResponse {
666            model_provider: "ollama".into(),
667            model: "llama3".into(),
668            duration: Duration::from_millis(100),
669            success: false,
670            error_message: Some("timeout".into()),
671            input_tokens: None,
672            output_tokens: None,
673        });
674
675        let output = obs.encode();
676        assert!(output.contains(
677            r#"zeroclaw_llm_requests_total{model="llama3",model_provider="ollama",success="false"} 1"#
678        ));
679        // Token counters should not appear (no data recorded)
680        assert!(!output.contains("zeroclaw_tokens_input_total{"));
681        assert!(!output.contains("zeroclaw_tokens_output_total{"));
682    }
683
684    #[test]
685    fn dora_deployment_events_track_counters() {
686        let obs = PrometheusObserver::new();
687
688        obs.record_event(&ObserverEvent::DeploymentCompleted {
689            deploy_id: "d1".into(),
690            commit_sha: "abc123".into(),
691        });
692        obs.record_event(&ObserverEvent::DeploymentCompleted {
693            deploy_id: "d2".into(),
694            commit_sha: "def456".into(),
695        });
696        obs.record_event(&ObserverEvent::DeploymentFailed {
697            deploy_id: "d3".into(),
698            reason: "timeout".into(),
699        });
700
701        let output = obs.encode();
702        assert!(output.contains(r#"zeroclaw_deployments_total{status="success"} 2"#));
703        assert!(output.contains(r#"zeroclaw_deployments_total{status="failure"} 1"#));
704    }
705
706    #[test]
707    fn dora_failure_rate_gauge_updates() {
708        let obs = PrometheusObserver::new();
709
710        obs.record_event(&ObserverEvent::DeploymentCompleted {
711            deploy_id: "d1".into(),
712            commit_sha: "abc".into(),
713        });
714        obs.record_event(&ObserverEvent::DeploymentFailed {
715            deploy_id: "d2".into(),
716            reason: "error".into(),
717        });
718
719        let output = obs.encode();
720        // 1 failure out of 2 total = 0.5
721        assert!(output.contains("zeroclaw_deployment_failure_rate 0.5"));
722    }
723
724    #[test]
725    fn dora_lead_time_and_recovery_metrics() {
726        let obs = PrometheusObserver::new();
727
728        obs.record_metric(&ObserverMetric::DeploymentLeadTime(Duration::from_secs(
729            3600,
730        )));
731        obs.record_metric(&ObserverMetric::RecoveryTime(Duration::from_secs(600)));
732
733        let output = obs.encode();
734        assert!(output.contains("zeroclaw_deployment_lead_time_seconds"));
735        assert!(output.contains("zeroclaw_recovery_time_seconds"));
736        assert!(output.contains("zeroclaw_mttr_seconds 600"));
737    }
738
739    #[test]
740    fn dora_started_and_recovery_events_no_panic() {
741        let obs = PrometheusObserver::new();
742
743        obs.record_event(&ObserverEvent::DeploymentStarted {
744            deploy_id: "d1".into(),
745        });
746        obs.record_event(&ObserverEvent::RecoveryCompleted {
747            deploy_id: "d1".into(),
748        });
749    }
750
751    #[test]
752    fn shared_returns_the_same_registry_across_calls() {
753        let a = PrometheusObserver::shared();
754        let b = PrometheusObserver::shared();
755        assert!(
756            Arc::ptr_eq(&a, &b),
757            "PrometheusObserver::shared() must hand out the same underlying \
758             instance to every caller, otherwise the gateway's /metrics scrape \
759             cannot see counters incremented by the channel orchestrator"
760        );
761    }
762
763    #[test]
764    fn arc_blanket_observer_impl_routes_to_inner() {
765        let shared_a = PrometheusObserver::shared();
766        let shared_b = PrometheusObserver::shared();
767
768        Observer::record_event(
769            &shared_a,
770            &ObserverEvent::ChannelMessage {
771                channel: "test-channel".into(),
772                direction: "inbound".into(),
773            },
774        );
775
776        let output = shared_b.encode();
777        assert!(
778            output.contains(
779                r#"zeroclaw_channel_messages_total{channel="test-channel",direction="inbound"} 1"#
780            ),
781            "an event recorded through one Arc handle must be visible when \
782             scraping through any other handle — output was: {output}"
783        );
784    }
785
786    #[test]
787    fn arc_blanket_observer_impl_preserves_downcast() {
788        let shared: Arc<PrometheusObserver> = PrometheusObserver::shared();
789        let observer: &dyn Observer = &shared;
790        assert!(
791            observer
792                .as_any()
793                .downcast_ref::<PrometheusObserver>()
794                .is_some(),
795            "the /metrics resolver downcasts through `as_any` — Arc<T> must \
796             surface the inner T, not the Arc wrapper"
797        );
798    }
799}