// zeroclaw_runtime/health/mod.rs

1use chrono::Utc;
2use parking_lot::Mutex;
3use serde::Serialize;
4use std::collections::BTreeMap;
5use std::sync::OnceLock;
6use std::time::Instant;
7
8#[derive(Debug, Clone, Serialize)]
9pub struct ComponentHealth {
10    pub status: String,
11    pub updated_at: String,
12    pub last_ok: Option<String>,
13    pub last_error: Option<String>,
14    pub restart_count: u64,
15}
16
17#[derive(Debug, Clone, Serialize)]
18pub struct HealthSnapshot {
19    pub pid: u32,
20    pub updated_at: String,
21    pub uptime_seconds: u64,
22    pub components: BTreeMap<String, ComponentHealth>,
23}
24
25struct HealthRegistry {
26    started_at: Instant,
27    started_at_wall: chrono::DateTime<chrono::Utc>,
28    components: Mutex<BTreeMap<String, ComponentHealth>>,
29}
30
31static REGISTRY: OnceLock<HealthRegistry> = OnceLock::new();
32
33fn registry() -> &'static HealthRegistry {
34    REGISTRY.get_or_init(|| HealthRegistry {
35        started_at: Instant::now(),
36        started_at_wall: Utc::now(),
37        components: Mutex::new(BTreeMap::new()),
38    })
39}
40
41/// Daemon start time as RFC 3339 UTC. Stable across the daemon's
42/// lifetime so the dashboard can implement "since daemon start"
43/// log queries without drift.
44pub fn daemon_started_at() -> String {
45    registry().started_at_wall.to_rfc3339()
46}
47
48fn now_rfc3339() -> String {
49    Utc::now().to_rfc3339()
50}
51
52fn upsert_component<F>(component: &str, update: F)
53where
54    F: FnOnce(&mut ComponentHealth),
55{
56    let mut map = registry().components.lock();
57    let now = now_rfc3339();
58    let entry = map
59        .entry(component.to_string())
60        .or_insert_with(|| ComponentHealth {
61            status: "starting".into(),
62            updated_at: now.clone(),
63            last_ok: None,
64            last_error: None,
65            restart_count: 0,
66        });
67    update(entry);
68    entry.updated_at = now;
69}
70
71pub fn mark_component_ok(component: &str) {
72    upsert_component(component, |entry| {
73        entry.status = "ok".into();
74        entry.last_ok = Some(now_rfc3339());
75        entry.last_error = None;
76    });
77}
78
79#[allow(clippy::needless_pass_by_value)]
80pub fn mark_component_error(component: &str, error: impl ToString) {
81    let err = error.to_string();
82    upsert_component(component, move |entry| {
83        entry.status = "error".into();
84        entry.last_error = Some(err);
85    });
86}
87
88pub fn bump_component_restart(component: &str) {
89    upsert_component(component, |entry| {
90        entry.restart_count = entry.restart_count.saturating_add(1);
91    });
92}
93
94pub fn snapshot() -> HealthSnapshot {
95    let components = registry().components.lock().clone();
96
97    HealthSnapshot {
98        pid: std::process::id(),
99        updated_at: now_rfc3339(),
100        uptime_seconds: registry().started_at.elapsed().as_secs(),
101        components,
102    }
103}
104
105pub fn snapshot_json() -> serde_json::Value {
106    serde_json::to_value(snapshot()).unwrap_or_else(|_| {
107        serde_json::json!({
108            "status": "error",
109            "message": "failed to serialize health snapshot"
110        })
111    })
112}
113
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a component name with a random UUID suffix so tests sharing the
    /// global registry never collide.
    fn unique_component(prefix: &str) -> String {
        let suffix = uuid::Uuid::new_v4();
        format!("{prefix}-{suffix}")
    }

    /// Takes a fresh snapshot and returns the record for `component`,
    /// panicking with `context` when it is absent.
    fn component_entry(component: &str, context: &str) -> ComponentHealth {
        snapshot()
            .components
            .get(component)
            .cloned()
            .expect(context)
    }

    #[test]
    fn mark_component_ok_initializes_component_state() {
        let name = unique_component("health-ok");
        mark_component_ok(&name);

        let health = component_entry(
            &name,
            "component should be present after mark_component_ok",
        );

        assert_eq!(health.status, "ok");
        assert!(health.last_ok.is_some());
        assert!(health.last_error.is_none());
    }

    #[test]
    fn mark_component_error_then_ok_clears_last_error() {
        let name = unique_component("health-error");

        // Failure is recorded first...
        mark_component_error(&name, "first failure");
        let failed = component_entry(
            &name,
            "component should exist after mark_component_error",
        );
        assert_eq!(failed.status, "error");
        assert_eq!(failed.last_error.as_deref(), Some("first failure"));

        // ...and a later success wipes the stored error.
        mark_component_ok(&name);
        let healed = component_entry(&name, "component should exist after recovery");
        assert_eq!(healed.status, "ok");
        assert!(healed.last_error.is_none());
        assert!(healed.last_ok.is_some());
    }

    #[test]
    fn bump_component_restart_increments_counter() {
        let name = unique_component("health-restart");

        for _ in 0..2 {
            bump_component_restart(&name);
        }

        let health = component_entry(&name, "component should exist after restart bump");
        assert_eq!(health.restart_count, 2);
    }

    #[test]
    fn snapshot_json_contains_registered_component_fields() {
        let name = unique_component("health-json");
        mark_component_ok(&name);

        let document = snapshot_json();
        let record = &document["components"][name.as_str()];

        assert_eq!(record["status"], "ok");
        assert!(record["updated_at"].as_str().is_some());
        assert!(record["last_ok"].as_str().is_some());
        assert!(document["uptime_seconds"].as_u64().is_some());
    }
}