zeroclaw_api/observability_traits.rs
1use std::time::Duration;
2
/// Lifecycle events the agent runtime reports to its observers.
///
/// Observers may record, aggregate, or forward any variant to an external
/// monitoring system. Each variant carries only the context needed for
/// tracing and diagnostics; sensitive prompt and response content is
/// deliberately left out.
#[derive(Debug, Clone)]
pub enum ObserverEvent {
    /// A new session of the agent orchestration loop has begun.
    AgentStart {
        model_provider: String,
        model: String,
    },
    /// A request is about to be sent to an LLM model provider.
    ///
    /// Emitted right before the provider call, so observers can show
    /// user-facing progress without leaking prompt contents.
    LlmRequest {
        model_provider: String,
        model: String,
        messages_count: usize,
    },
    /// Outcome of one call to an LLM model provider.
    LlmResponse {
        model_provider: String,
        model: String,
        duration: Duration,
        success: bool,
        error_message: Option<String>,
        input_tokens: Option<u64>,
        output_tokens: Option<u64>,
    },
    /// The agent session is over.
    ///
    /// Aggregate usage data (tokens, cost) is included when the model
    /// provider reports it.
    AgentEnd {
        model_provider: String,
        model: String,
        duration: Duration,
        tokens_used: Option<u64>,
        cost_usd: Option<f64>,
    },
    /// Execution of a tool call is about to begin.
    ToolCallStart {
        tool: String,
        /// Identifier assigned by the provider when the call came from a
        /// native structured tool call block (e.g. OpenAI
        /// `tool_calls[].id`, Anthropic `tool_use.id`). `None` for tool
        /// calls parsed out of text (XML/markdown).
        ///
        /// This id lets observers correlate `ToolCallStart` → `ToolCall` →
        /// the LLM response that emitted the call.
        tool_call_id: Option<String>,
        /// Full JSON arguments handed to the tool by the agent. `None` if
        /// the call site does not have the arguments available.
        arguments: Option<String>,
    },
    /// A tool call finished, successfully or not.
    ToolCall {
        tool: String,
        /// Provider-assigned tool call identifier, if any. See the
        /// `tool_call_id` field of [`ObserverEvent::ToolCallStart`].
        tool_call_id: Option<String>,
        duration: Duration,
        success: bool,
        /// Full JSON arguments handed to the tool by the agent.
        ///
        /// Repeated here (they are also on `ToolCallStart`) so that
        /// observers which emit one completed span per tool call — such as
        /// the OTel exporter — can attach arguments when the span ends
        /// without keeping per-call state.
        arguments: Option<String>,
        /// Scrubbed tool output or error reason. Set for successful and
        /// failed calls alike so trace viewers can display the actual tool
        /// result. Credentials are scrubbed before this field is emitted.
        result: Option<String>,
    },
    /// The agent has produced its final answer to the current user message.
    TurnComplete,
    /// A message went out or came in through a channel.
    ChannelMessage {
        /// Name of the channel (e.g., `"telegram"`, `"discord"`).
        channel: String,
        /// Either `"inbound"` or `"outbound"`.
        direction: String,
    },
    /// Periodic tick of the runtime keep-alive loop.
    HeartbeatTick,
    /// The response cache had an entry, so an LLM call was avoided.
    CacheHit {
        /// `"hot"` (in-memory) or `"warm"` (SQLite).
        cache_type: String,
        /// Estimated number of tokens this hit saved.
        tokens_saved: u64,
    },
    /// The response cache had no entry for the prompt.
    CacheMiss {
        /// The `"response"` cache layer that was consulted.
        cache_type: String,
    },
    /// A named component reported an error.
    Error {
        /// Subsystem the error came from (e.g., `"model_provider"`, `"gateway"`).
        component: String,
        /// Human-readable description. Must not contain secrets or tokens.
        message: String,
    },
    /// A deployment has begun.
    DeploymentStarted {
        /// Deployment identifier (e.g., commit SHA or release tag).
        deploy_id: String,
    },
    /// A deployment finished successfully.
    DeploymentCompleted {
        deploy_id: String,
        /// SHA of the commit that was deployed.
        commit_sha: String,
    },
    /// A deployment did not succeed.
    DeploymentFailed {
        deploy_id: String,
        /// Human-readable reason for the failure.
        reason: String,
    },
    /// Recovery after a failed deployment has finished.
    RecoveryCompleted { deploy_id: String },
}
131
/// Numeric measurements reported by the agent runtime.
///
/// Observers can feed these into dashboards, alerts, or structured logs.
/// Every variant wraps exactly one scalar; its unit is implied by the
/// variant rather than encoded in the type.
#[derive(Debug, Clone)]
pub enum ObserverMetric {
    /// How long a single LLM or tool request took.
    RequestLatency(Duration),
    /// Tokens consumed by one LLM call.
    TokensUsed(u64),
    /// Number of concurrent sessions currently active.
    ActiveSessions(u64),
    /// Current depth of the inbound message queue.
    QueueDepth(u64),
    /// Lead time for changes: time from commit to deployment.
    DeploymentLeadTime(Duration),
    /// How long it took to recover from a failed deployment.
    RecoveryTime(Duration),
}
151
152/// Core observability trait for recording agent runtime telemetry.
153///
154/// Implement this trait to integrate with any monitoring backend (structured
155/// logging, Prometheus, OpenTelemetry, etc.). The agent runtime holds one or
156/// more `Observer` instances and calls [`record_event`](Observer::record_event)
157/// and [`record_metric`](Observer::record_metric) at key lifecycle points.
158///
159/// Implementations must be `Send + Sync + 'static` because the observer is
160/// shared across async tasks via `Arc`.
161pub trait Observer: Send + Sync + 'static {
162 /// Record a discrete lifecycle event.
163 ///
164 /// Called synchronously on the hot path; implementations should avoid
165 /// blocking I/O. Buffer events internally and flush asynchronously
166 /// when possible.
167 fn record_event(&self, event: &ObserverEvent);
168
169 /// Record a numeric metric sample.
170 ///
171 /// Called synchronously; same non-blocking guidance as
172 /// [`record_event`](Observer::record_event).
173 fn record_metric(&self, metric: &ObserverMetric);
174
175 /// Flush any buffered telemetry data to the backend.
176 ///
177 /// The runtime calls this during graceful shutdown. The default
178 /// implementation is a no-op, which is appropriate for backends
179 /// that write synchronously.
180 fn flush(&self) {}
181
182 /// Return the human-readable name of this observer backend.
183 ///
184 /// Used in logs and diagnostics (e.g., `"console"`, `"prometheus"`,
185 /// `"opentelemetry"`).
186 fn name(&self) -> &str;
187
188 /// Downcast to `Any` for backend-specific operations.
189 ///
190 /// Enables callers to access concrete observer types when needed
191 /// (e.g., retrieving a Prometheus registry handle for custom metrics).
192 fn as_any(&self) -> &dyn std::any::Any;
193}
194
195/// Blanket implementation: `Arc<T>` delegates all `Observer` methods to `T`.
196///
197/// Lets a singleton observer be handed out as `Arc<MyObserver>` and still be
198/// used wherever `Box<dyn Observer>` is expected (e.g.
199/// `Box::new(MyObserver::shared())`). `as_any` deliberately delegates to the
200/// inner `T` so downcasts in handlers like `/metrics` recover the concrete
201/// type rather than the `Arc` wrapper.
202impl<T: Observer + ?Sized> Observer for std::sync::Arc<T> {
203 fn record_event(&self, event: &ObserverEvent) {
204 self.as_ref().record_event(event);
205 }
206
207 fn record_metric(&self, metric: &ObserverMetric) {
208 self.as_ref().record_metric(metric);
209 }
210
211 fn flush(&self) {
212 self.as_ref().flush();
213 }
214
215 fn name(&self) -> &str {
216 self.as_ref().name()
217 }
218
219 fn as_any(&self) -> &dyn std::any::Any {
220 self.as_ref().as_any()
221 }
222}
223
#[cfg(test)]
mod tests {
    use super::*;
    use parking_lot::Mutex;
    use std::sync::Arc;
    use std::time::Duration;

    /// Minimal observer that only counts the events and metrics it receives.
    #[derive(Default)]
    struct DummyObserver {
        events: Mutex<u64>,
        metrics: Mutex<u64>,
    }

    impl Observer for DummyObserver {
        fn record_event(&self, _event: &ObserverEvent) {
            let mut guard = self.events.lock();
            *guard += 1;
        }

        fn record_metric(&self, _metric: &ObserverMetric) {
            let mut guard = self.metrics.lock();
            *guard += 1;
        }

        fn name(&self) -> &str {
            "dummy-observer"
        }

        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
    }

    /// Events and metrics reach the observer and are counted separately.
    #[test]
    fn observer_records_events_and_metrics() {
        let observer = DummyObserver::default();

        observer.record_event(&ObserverEvent::HeartbeatTick);
        observer.record_event(&ObserverEvent::Error {
            component: "test".into(),
            message: "boom".into(),
        });
        observer.record_metric(&ObserverMetric::TokensUsed(42));

        assert_eq!(*observer.events.lock(), 2);
        assert_eq!(*observer.metrics.lock(), 1);
    }

    /// The default `flush` is callable and `as_any` recovers the concrete type.
    #[test]
    fn observer_default_flush_and_as_any_work() {
        let observer = DummyObserver::default();

        observer.flush();
        assert_eq!(observer.name(), "dummy-observer");
        assert!(observer.as_any().downcast_ref::<DummyObserver>().is_some());
    }

    /// Both telemetry enums can be cloned without changing variant.
    #[test]
    fn observer_event_and_metric_are_cloneable() {
        let event = ObserverEvent::ToolCall {
            tool: "shell".into(),
            tool_call_id: Some("call_abc123".into()),
            duration: Duration::from_millis(10),
            success: true,
            arguments: Some(r#"{"command":"date"}"#.into()),
            result: Some("Mon Apr 22 12:00:00 UTC 2026\n".into()),
        };
        let metric = ObserverMetric::RequestLatency(Duration::from_millis(8));

        let cloned_event = event.clone();
        let cloned_metric = metric.clone();

        assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. }));
        assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_)));
    }

    /// An `Arc<T>` used as `Box<dyn Observer>` forwards every call to the
    /// shared inner observer, so the original handle sees the recorded data.
    #[test]
    fn arc_wrapped_observer_delegates_to_inner() {
        let shared = Arc::new(DummyObserver::default());
        let handle = Arc::clone(&shared);
        let boxed: Box<dyn Observer> = Box::new(handle);

        boxed.record_event(&ObserverEvent::TurnComplete);
        boxed.record_metric(&ObserverMetric::QueueDepth(3));
        boxed.flush();

        assert_eq!(boxed.name(), "dummy-observer");
        assert_eq!(*shared.events.lock(), 1);
        assert_eq!(*shared.metrics.lock(), 1);
    }

    /// `as_any` on the `Arc` wrapper must expose the inner concrete type,
    /// not the `Arc` itself, so downcasts by callers keep working.
    #[test]
    fn arc_wrapped_observer_downcasts_to_inner_type() {
        let handle = Arc::new(DummyObserver::default());
        let boxed: Box<dyn Observer> = Box::new(handle);

        assert!(boxed.as_any().downcast_ref::<DummyObserver>().is_some());
        assert!(boxed
            .as_any()
            .downcast_ref::<Arc<DummyObserver>>()
            .is_none());
    }
}