zeroclaw_runtime/health/
mod.rs1use chrono::Utc;
2use parking_lot::Mutex;
3use serde::Serialize;
4use std::collections::BTreeMap;
5use std::sync::OnceLock;
6use std::time::Instant;
7
8#[derive(Debug, Clone, Serialize)]
9pub struct ComponentHealth {
10 pub status: String,
11 pub updated_at: String,
12 pub last_ok: Option<String>,
13 pub last_error: Option<String>,
14 pub restart_count: u64,
15}
16
17#[derive(Debug, Clone, Serialize)]
18pub struct HealthSnapshot {
19 pub pid: u32,
20 pub updated_at: String,
21 pub uptime_seconds: u64,
22 pub components: BTreeMap<String, ComponentHealth>,
23}
24
25struct HealthRegistry {
26 started_at: Instant,
27 started_at_wall: chrono::DateTime<chrono::Utc>,
28 components: Mutex<BTreeMap<String, ComponentHealth>>,
29}
30
31static REGISTRY: OnceLock<HealthRegistry> = OnceLock::new();
32
33fn registry() -> &'static HealthRegistry {
34 REGISTRY.get_or_init(|| HealthRegistry {
35 started_at: Instant::now(),
36 started_at_wall: Utc::now(),
37 components: Mutex::new(BTreeMap::new()),
38 })
39}
40
41pub fn daemon_started_at() -> String {
45 registry().started_at_wall.to_rfc3339()
46}
47
48fn now_rfc3339() -> String {
49 Utc::now().to_rfc3339()
50}
51
52fn upsert_component<F>(component: &str, update: F)
53where
54 F: FnOnce(&mut ComponentHealth),
55{
56 let mut map = registry().components.lock();
57 let now = now_rfc3339();
58 let entry = map
59 .entry(component.to_string())
60 .or_insert_with(|| ComponentHealth {
61 status: "starting".into(),
62 updated_at: now.clone(),
63 last_ok: None,
64 last_error: None,
65 restart_count: 0,
66 });
67 update(entry);
68 entry.updated_at = now;
69}
70
71pub fn mark_component_ok(component: &str) {
72 upsert_component(component, |entry| {
73 entry.status = "ok".into();
74 entry.last_ok = Some(now_rfc3339());
75 entry.last_error = None;
76 });
77}
78
79#[allow(clippy::needless_pass_by_value)]
80pub fn mark_component_error(component: &str, error: impl ToString) {
81 let err = error.to_string();
82 upsert_component(component, move |entry| {
83 entry.status = "error".into();
84 entry.last_error = Some(err);
85 });
86}
87
88pub fn bump_component_restart(component: &str) {
89 upsert_component(component, |entry| {
90 entry.restart_count = entry.restart_count.saturating_add(1);
91 });
92}
93
94pub fn snapshot() -> HealthSnapshot {
95 let components = registry().components.lock().clone();
96
97 HealthSnapshot {
98 pid: std::process::id(),
99 updated_at: now_rfc3339(),
100 uptime_seconds: registry().started_at.elapsed().as_secs(),
101 components,
102 }
103}
104
105pub fn snapshot_json() -> serde_json::Value {
106 serde_json::to_value(snapshot()).unwrap_or_else(|_| {
107 serde_json::json!({
108 "status": "error",
109 "message": "failed to serialize health snapshot"
110 })
111 })
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a component name unique to the calling test; all tests share
    /// the same global registry, so names must not collide.
    fn unique_component(prefix: &str) -> String {
        let suffix = uuid::Uuid::new_v4();
        format!("{prefix}-{suffix}")
    }

    #[test]
    fn mark_component_ok_initializes_component_state() {
        let name = unique_component("health-ok");

        mark_component_ok(&name);

        let snap = snapshot();
        let health = snap
            .components
            .get(&name)
            .expect("component should be present after mark_component_ok");

        assert_eq!(health.status, "ok");
        assert!(health.last_ok.is_some());
        assert!(health.last_error.is_none());
    }

    #[test]
    fn mark_component_error_then_ok_clears_last_error() {
        let name = unique_component("health-error");

        // Phase 1: the failure is recorded.
        mark_component_error(&name, "first failure");
        let after_error = snapshot();
        let failed = after_error
            .components
            .get(&name)
            .expect("component should exist after mark_component_error");
        assert_eq!(failed.status, "error");
        assert_eq!(failed.last_error.as_deref(), Some("first failure"));

        // Phase 2: recovery wipes the error and records the ok time.
        mark_component_ok(&name);
        let after_recovery = snapshot();
        let healthy = after_recovery
            .components
            .get(&name)
            .expect("component should exist after recovery");
        assert_eq!(healthy.status, "ok");
        assert!(healthy.last_error.is_none());
        assert!(healthy.last_ok.is_some());
    }

    #[test]
    fn bump_component_restart_increments_counter() {
        let name = unique_component("health-restart");

        bump_component_restart(&name);
        bump_component_restart(&name);

        let snap = snapshot();
        let health = snap
            .components
            .get(&name)
            .expect("component should exist after restart bump");

        assert_eq!(health.restart_count, 2);
    }

    #[test]
    fn snapshot_json_contains_registered_component_fields() {
        let name = unique_component("health-json");

        mark_component_ok(&name);

        let document = snapshot_json();
        let fields = &document["components"][&name];

        assert_eq!(fields["status"], "ok");
        assert!(fields["updated_at"].as_str().is_some());
        assert!(fields["last_ok"].as_str().is_some());
        assert!(document["uptime_seconds"].as_u64().is_some());
    }
}