//! Observability events, metrics, and the `Observer` trait for the agent
//! runtime (`zeroclaw_api/observability_traits.rs`).

use std::time::Duration;

/// Discrete events emitted by the agent runtime for observability.
///
/// Each variant represents a lifecycle event that observers can record,
/// aggregate, or forward to external monitoring systems. Events carry
/// just enough context for tracing and diagnostics.
///
/// No variant carries raw prompt or LLM response text. Note, however, that
/// the tool-call variants ([`ObserverEvent::ToolCallStart`] and
/// [`ObserverEvent::ToolCall`]) include the full JSON arguments passed to the
/// tool and the credential-scrubbed tool output, so observers should treat
/// those fields as potentially sensitive.
#[derive(Debug, Clone)]
pub enum ObserverEvent {
    /// The agent orchestration loop has started a new session.
    AgentStart {
        model_provider: String,
        model: String,
    },
    /// A request is about to be sent to an LLM model_provider.
    ///
    /// This is emitted immediately before a model_provider call so observers can print
    /// user-facing progress without leaking prompt contents.
    LlmRequest {
        model_provider: String,
        model: String,
        /// Message count for the request; only the count is carried, never
        /// the message contents.
        messages_count: usize,
    },
    /// Result of a single LLM model_provider call.
    LlmResponse {
        model_provider: String,
        model: String,
        duration: Duration,
        /// Whether the call succeeded.
        success: bool,
        /// Error description, when one is available. Presumably `None` for
        /// successful calls — verify against the emitting call sites.
        error_message: Option<String>,
        /// Input token count; `None` when unavailable.
        input_tokens: Option<u64>,
        /// Output token count; `None` when unavailable.
        output_tokens: Option<u64>,
    },
    /// The agent session has finished.
    ///
    /// Carries aggregate usage data (tokens, cost) when the model_provider reports it.
    AgentEnd {
        model_provider: String,
        model: String,
        duration: Duration,
        /// Aggregate token usage; `None` when the model_provider does not report it.
        tokens_used: Option<u64>,
        /// Aggregate cost in US dollars; `None` when the model_provider does not report it.
        cost_usd: Option<f64>,
    },
    /// A tool call is about to be executed.
    ToolCallStart {
        tool: String,
        /// Provider-assigned tool call identifier, when the underlying tool
        /// call originated from a native structured tool call block (e.g.
        /// OpenAI `tool_calls[].id`, Anthropic `tool_use.id`). `None` for
        /// text-parsed (XML/markdown) tool calls.
        ///
        /// Observers can correlate `ToolCallStart` → `ToolCall` → the
        /// emitting LLM response via this id.
        tool_call_id: Option<String>,
        /// Full JSON arguments the agent passed to the tool. `None` when
        /// arguments are unavailable at the call site.
        arguments: Option<String>,
    },
    /// A tool call has completed with a success/failure outcome.
    ToolCall {
        tool: String,
        /// Provider-assigned tool call identifier, when present. See
        /// [`ObserverEvent::ToolCallStart::tool_call_id`].
        tool_call_id: Option<String>,
        duration: Duration,
        success: bool,
        /// Full JSON arguments the agent passed to the tool.
        ///
        /// Carried here (in addition to `ToolCallStart`) so observers that
        /// build a single completed span per tool call — e.g. the OTel
        /// exporter — can attach arguments at span-end time without holding
        /// per-call state.
        arguments: Option<String>,
        /// Scrubbed tool output or error reason. Populated for both success
        /// and failure outcomes so backends can show the actual tool result
        /// in trace viewers. Credentials are scrubbed before this field is
        /// emitted.
        result: Option<String>,
    },
    /// The agent produced a final answer for the current user message.
    TurnComplete,
    /// A message was sent or received through a channel.
    ChannelMessage {
        /// Channel name (e.g., `"telegram"`, `"discord"`).
        channel: String,
        /// `"inbound"` or `"outbound"`.
        direction: String,
    },
    /// Periodic heartbeat tick from the runtime keep-alive loop.
    HeartbeatTick,
    /// Response cache hit — an LLM call was avoided.
    CacheHit {
        /// `"hot"` (in-memory) or `"warm"` (SQLite).
        cache_type: String,
        /// Estimated tokens saved by this cache hit.
        tokens_saved: u64,
    },
    /// Response cache miss — the prompt was not found in cache.
    CacheMiss {
        /// `"response"` cache layer that was checked.
        cache_type: String,
    },
    /// An error occurred in a named component.
    Error {
        /// Subsystem where the error originated (e.g., `"model_provider"`, `"gateway"`).
        component: String,
        /// Human-readable error description. Must not contain secrets or tokens.
        message: String,
    },
    /// A deployment has started.
    DeploymentStarted {
        /// Identifier for the deployment (e.g., commit SHA or release tag).
        deploy_id: String,
    },
    /// A deployment has completed successfully.
    DeploymentCompleted {
        deploy_id: String,
        /// Commit SHA that was deployed.
        commit_sha: String,
    },
    /// A deployment has failed.
    DeploymentFailed {
        deploy_id: String,
        /// Human-readable failure reason.
        reason: String,
    },
    /// Recovery from a failed deployment has completed.
    RecoveryCompleted { deploy_id: String },
}

/// Numeric metrics emitted by the agent runtime.
///
/// Observers can aggregate these into dashboards, alerts, or structured logs.
/// Each variant carries a single scalar value with implicit units: the
/// time-based variants wrap a [`Duration`], and the count/gauge variants wrap
/// a plain `u64`.
#[derive(Debug, Clone)]
pub enum ObserverMetric {
    /// Time elapsed for a single LLM or tool request.
    RequestLatency(Duration),
    /// Number of tokens consumed by an LLM call.
    TokensUsed(u64),
    /// Current number of active concurrent sessions.
    ActiveSessions(u64),
    /// Current depth of the inbound message queue.
    QueueDepth(u64),
    /// Time elapsed from commit to deployment (lead time for changes).
    DeploymentLeadTime(Duration),
    /// Time elapsed to recover from a failed deployment.
    RecoveryTime(Duration),
}

152/// Core observability trait for recording agent runtime telemetry.
153///
154/// Implement this trait to integrate with any monitoring backend (structured
155/// logging, Prometheus, OpenTelemetry, etc.). The agent runtime holds one or
156/// more `Observer` instances and calls [`record_event`](Observer::record_event)
157/// and [`record_metric`](Observer::record_metric) at key lifecycle points.
158///
159/// Implementations must be `Send + Sync + 'static` because the observer is
160/// shared across async tasks via `Arc`.
161pub trait Observer: Send + Sync + 'static {
162    /// Record a discrete lifecycle event.
163    ///
164    /// Called synchronously on the hot path; implementations should avoid
165    /// blocking I/O. Buffer events internally and flush asynchronously
166    /// when possible.
167    fn record_event(&self, event: &ObserverEvent);
168
169    /// Record a numeric metric sample.
170    ///
171    /// Called synchronously; same non-blocking guidance as
172    /// [`record_event`](Observer::record_event).
173    fn record_metric(&self, metric: &ObserverMetric);
174
175    /// Flush any buffered telemetry data to the backend.
176    ///
177    /// The runtime calls this during graceful shutdown. The default
178    /// implementation is a no-op, which is appropriate for backends
179    /// that write synchronously.
180    fn flush(&self) {}
181
182    /// Return the human-readable name of this observer backend.
183    ///
184    /// Used in logs and diagnostics (e.g., `"console"`, `"prometheus"`,
185    /// `"opentelemetry"`).
186    fn name(&self) -> &str;
187
188    /// Downcast to `Any` for backend-specific operations.
189    ///
190    /// Enables callers to access concrete observer types when needed
191    /// (e.g., retrieving a Prometheus registry handle for custom metrics).
192    fn as_any(&self) -> &dyn std::any::Any;
193}
194
195/// Blanket implementation: `Arc<T>` delegates all `Observer` methods to `T`.
196///
197/// Lets a singleton observer be handed out as `Arc<MyObserver>` and still be
198/// used wherever `Box<dyn Observer>` is expected (e.g.
199/// `Box::new(MyObserver::shared())`). `as_any` deliberately delegates to the
200/// inner `T` so downcasts in handlers like `/metrics` recover the concrete
201/// type rather than the `Arc` wrapper.
202impl<T: Observer + ?Sized> Observer for std::sync::Arc<T> {
203    fn record_event(&self, event: &ObserverEvent) {
204        self.as_ref().record_event(event);
205    }
206
207    fn record_metric(&self, metric: &ObserverMetric) {
208        self.as_ref().record_metric(metric);
209    }
210
211    fn flush(&self) {
212        self.as_ref().flush();
213    }
214
215    fn name(&self) -> &str {
216        self.as_ref().name()
217    }
218
219    fn as_any(&self) -> &dyn std::any::Any {
220        self.as_ref().as_any()
221    }
222}
223
#[cfg(test)]
mod tests {
    use super::*;
    use parking_lot::Mutex;
    use std::sync::Arc;
    use std::time::Duration;

    /// Minimal observer that only counts how many events and metrics it
    /// received; `flush` is left at the trait's default no-op.
    #[derive(Default)]
    struct DummyObserver {
        events: Mutex<u64>,
        metrics: Mutex<u64>,
    }

    impl Observer for DummyObserver {
        fn record_event(&self, _event: &ObserverEvent) {
            *self.events.lock() += 1;
        }

        fn record_metric(&self, _metric: &ObserverMetric) {
            *self.metrics.lock() += 1;
        }

        fn name(&self) -> &str {
            "dummy-observer"
        }

        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
    }

    #[test]
    fn observer_records_events_and_metrics() {
        let observer = DummyObserver::default();

        observer.record_event(&ObserverEvent::HeartbeatTick);
        observer.record_event(&ObserverEvent::Error {
            component: "test".into(),
            message: "boom".into(),
        });
        observer.record_metric(&ObserverMetric::TokensUsed(42));

        assert_eq!(*observer.events.lock(), 2);
        assert_eq!(*observer.metrics.lock(), 1);
    }

    #[test]
    fn observer_default_flush_and_as_any_work() {
        let observer = DummyObserver::default();

        observer.flush();
        assert_eq!(observer.name(), "dummy-observer");
        assert!(observer.as_any().downcast_ref::<DummyObserver>().is_some());
    }

    #[test]
    fn observer_event_and_metric_are_cloneable() {
        let event = ObserverEvent::ToolCall {
            tool: "shell".into(),
            tool_call_id: Some("call_abc123".into()),
            duration: Duration::from_millis(10),
            success: true,
            arguments: Some(r#"{"command":"date"}"#.into()),
            result: Some("Mon Apr 22 12:00:00 UTC 2026\n".into()),
        };
        let metric = ObserverMetric::RequestLatency(Duration::from_millis(8));

        let cloned_event = event.clone();
        let cloned_metric = metric.clone();

        assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. }));
        assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_)));
    }

    /// The `Arc<T>` blanket impl must forward every call to the shared inner
    /// observer, and `as_any` must expose the inner concrete type so that
    /// downcasts work through a `Box<dyn Observer>` holding an `Arc`.
    #[test]
    fn arc_wrapped_observer_delegates_to_inner() {
        let shared = Arc::new(DummyObserver::default());
        let handle = Arc::clone(&shared);
        let boxed: Box<dyn Observer> = Box::new(handle);

        boxed.record_event(&ObserverEvent::TurnComplete);
        boxed.record_metric(&ObserverMetric::QueueDepth(3));
        boxed.flush();

        assert_eq!(boxed.name(), "dummy-observer");
        assert!(boxed.as_any().downcast_ref::<DummyObserver>().is_some());
        assert_eq!(*shared.events.lock(), 1);
        assert_eq!(*shared.metrics.lock(), 1);
    }
}