Skip to main content

zeroclaw_config/
migration.rs

1use anyhow::{Context, Result};
2use std::path::Path;
3
4use crate::schema::Config;
5use crate::schema::v1::V1Config;
6use crate::schema::v2::V2Config;
7
/// The schema version this binary writes and expects on disk.
///
/// Configs whose detected version equals this value are passed through
/// unmigrated; configs reporting a larger value are rejected as "newer than
/// this binary supports" by the migration entry points in this module.
pub const CURRENT_SCHEMA_VERSION: u32 = 3;
10
/// Top-level TOML keys that legacy schema versions had but V3 either
/// removed or restructured. Suppresses "unknown key" warnings on V1/V2
/// configs flowing through `migrate_to_current`: every key here is
/// consumed by `V1Config::migrate` or `V2Config::migrate`, so it's
/// expected on a stale-but-being-migrated config.
///
/// NOTE(review): despite the `V1_` prefix, the description above covers keys
/// from both V1 and V2 configs — confirm against the consumers of this list
/// before renaming or splitting it.
pub const V1_LEGACY_KEYS: &[&str] = &[
    "api_key",
    "api_url",
    "api_path",
    "default_model_provider",
    "default_model",
    "model_providers",
    "default_temperature",
    "provider_timeout_secs",
    "provider_max_tokens",
    "extra_headers",
    "model_routes",
    "embedding_routes",
    "channels_config",
    "autonomy",
    "agent",
    "swarms",
    "cron",
];
35
36/// Detect a config's schema version from its parsed TOML representation.
37///
38/// - Missing top-level `schema_version` key → V1 (pre-versioned).
39/// - Integer ≥ 1 → that integer.
40/// - Anything else → error.
41pub fn detect_version(value: &toml::Value) -> Result<u32> {
42    let table = value
43        .as_table()
44        .context("config root must be a TOML table")?;
45    match table.get("schema_version") {
46        None => Ok(1),
47        Some(toml::Value::Integer(n)) if *n >= 1 => Ok(*n as u32),
48        Some(other) => {
49            ::zeroclaw_log::record!(
50                ERROR,
51                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
52                    .with_outcome(::zeroclaw_log::EventOutcome::Failure)
53                    .with_attrs(::serde_json::json!({"found": other.to_string()})),
54                "config schema_version is not a positive integer"
55            );
56            anyhow::bail!("schema_version must be a positive integer, got {other}")
57        }
58    }
59}
60
61/// Pure migration from any supported version's TOML string into the current
62/// schema version's TOML string. Returns `Ok(None)` when the input is already
63/// at `CURRENT_SCHEMA_VERSION`.
64///
65/// Comments and decoration on keys whose dotted path survives the migration
66/// are preserved via `toml_edit::DocumentMut` reconciliation (`sync_table`).
67/// Keys that are renamed, removed, or restructured lose their comments — the
68/// `.backup` file written by `migrate_file_in_place` retains the original
69/// for manual recovery.
70pub fn migrate_file(input: &str) -> Result<Option<String>> {
71    let value: toml::Value = toml::from_str(input).context("failed to parse config TOML")?;
72    let from = detect_version(&value)?;
73    if from == CURRENT_SCHEMA_VERSION {
74        return Ok(None);
75    }
76    if from > CURRENT_SCHEMA_VERSION {
77        ::zeroclaw_log::record!(
78            ERROR,
79            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
80                .with_outcome(::zeroclaw_log::EventOutcome::Failure)
81                .with_attrs(::serde_json::json!({
82                    "from_version": from,
83                    "supported_version": CURRENT_SCHEMA_VERSION,
84                })),
85            "config schema_version is newer than this binary supports"
86        );
87        anyhow::bail!(
88            "config schema_version {from} is newer than this binary supports ({CURRENT_SCHEMA_VERSION})"
89        );
90    }
91    let migrated_value = run_chain(value, from)?;
92    let migrated_table = match migrated_value {
93        toml::Value::Table(t) => t,
94        _ => {
95            anyhow::bail!("migrated config is not a TOML table");
96        }
97    };
98
99    // Try to preserve comments by reconciling into the original DocumentMut.
100    // If the original doesn't parse as toml_edit (rare — toml::from_str
101    // already succeeded on it), fall back to a fresh serialization.
102    if let Ok(mut doc) = input.parse::<toml_edit::DocumentMut>() {
103        sync_table(doc.as_table_mut(), &migrated_table);
104        Ok(Some(doc.to_string()))
105    } else {
106        let serialized = toml::to_string_pretty(&toml::Value::Table(migrated_table))
107            .context("failed to serialize migrated config")?;
108        Ok(Some(serialized))
109    }
110}
111
/// Embedded V1 fixture used by [`generate`] / the `zeroclaw config generate`
/// CLI. Authored against the V1 schema at the parent of the V2-intro
/// commit; see `fixtures/v1.toml`.
///
/// The file is baked into the binary at compile time via `include_str!`, so
/// `generate` never reads it from disk at runtime.
const V1_FIXTURE: &str = include_str!("../fixtures/v1.toml");
116
/// Options for [`generate`].
///
/// The `Default` value disables encryption and carries no secret-store
/// directory.
#[derive(Debug, Default, Clone)]
pub struct GenerateOptions<'a> {
    /// Encrypt secret-bearing string values in the output. Works at every
    /// schema version via [`encrypt_secret_strings`], which walks the TOML
    /// and ChaCha20-Poly1305-encrypts any leaf whose key name appears in
    /// `SECRET_KEY_NAMES`.
    pub encrypt_secrets: bool,
    /// Directory containing (or to receive) the `.secret_key` used for
    /// `enc2:` encryption. Required when `encrypt_secrets` is true. The
    /// key is created with 0o600 permissions if absent — matches how the
    /// daemon's `SecretStore` behaves on first use.
    ///
    /// [`generate`] returns an error if this is `None` while
    /// `encrypt_secrets` is set.
    pub secret_store_dir: Option<&'a Path>,
}
131
132/// Generate a canonical TOML config at `target_version`, derived by
133/// running the V1 fixture forward through the typed migration chain.
134///
135/// `target_version` must be in `1..=CURRENT_SCHEMA_VERSION`. The chain is
136/// the same one used to migrate real on-disk configs — V1 fixture →
137/// `V1Config::migrate` → V2 typed value → `V2Config::migrate` → V3 typed
138/// value — so `generate <n>` shows exactly the shape an operator running
139/// `zeroclaw config migrate` would land on if they started from the V1
140/// fixture.
141///
142/// When [`GenerateOptions::encrypt_secrets`] is set, secret-bearing
143/// string values (api_key, bot_token, access_token, etc. — see
144/// `SECRET_KEY_NAMES`) are ChaCha20-Poly1305-encrypted with the
145/// `.secret_key` under `secret_store_dir`. Works at every version.
146pub fn generate(target_version: u32, opts: &GenerateOptions<'_>) -> Result<String> {
147    if target_version == 0 || target_version > CURRENT_SCHEMA_VERSION {
148        anyhow::bail!(
149            "unsupported schema version {target_version} \
150             (valid: 1..={CURRENT_SCHEMA_VERSION})"
151        );
152    }
153
154    let value = if target_version == 1 {
155        toml::from_str::<toml::Value>(V1_FIXTURE).context("embedded V1 fixture is malformed")?
156    } else {
157        let v1_value: toml::Value =
158            toml::from_str(V1_FIXTURE).context("embedded V1 fixture is malformed")?;
159        run_chain_until(v1_value, 1, target_version)?
160    };
161
162    let mut value = value;
163    if opts.encrypt_secrets {
164        let store_dir = opts.secret_store_dir.context(
165            "--encrypt requires a secret-store directory \
166             (typically the resolved ZEROCLAW_CONFIG_DIR)",
167        )?;
168        let store = crate::secrets::SecretStore::new(store_dir, true);
169        encrypt_secret_strings(&mut value, &store)
170            .context("failed to encrypt secret-bearing fields in generated config")?;
171    }
172
173    toml::to_string_pretty(&value).context("failed to serialize generated config")
174}
175
176/// Set of TOML terminal key names whose string leaves are treated as
177/// secrets by [`encrypt_secret_strings`]. Sourced from
178/// `Config::secret_field_terminals()`, the macro-emitted static
179/// enumeration of every `#[secret]` field reachable from the schema.
180/// The set is schema-driven — adding a new `#[secret]` annotation
181/// anywhere in the schema automatically extends encryption coverage
182/// with no companion edit in this module.
183///
184/// `secret_field_terminals()` (vs. the older `prop_fields().filter(is_secret)`
185/// approach) covers compound shapes like `HashMap<String, String>`
186/// — `prop_fields()` intentionally skips non-Vec compound types, which
187/// would silently drop e.g. `mcp.servers[*].headers` from the allowlist.
188fn secret_key_names() -> &'static std::collections::HashSet<&'static str> {
189    use std::collections::HashSet;
190    use std::sync::OnceLock;
191    static CACHE: OnceLock<HashSet<&'static str>> = OnceLock::new();
192    CACHE.get_or_init(|| Config::secret_field_terminals().into_iter().collect())
193}
194
195/// Walk a TOML tree and encrypt every string leaf whose terminal key
196/// name appears in `secret_key_names`. Strings already in `enc2:` /
197/// `enc:` form are left alone (idempotent). Arrays of strings under a
198/// matching key (e.g. `paired_tokens`) are encrypted element-wise.
199///
200/// Works at every schema version because it operates on raw TOML
201/// rather than a typed `#[secret]` index — only the *set of key names
202/// to encrypt* comes from the typed schema; the walker itself doesn't
203/// care about types.
204pub fn encrypt_secret_strings(
205    value: &mut toml::Value,
206    store: &crate::secrets::SecretStore,
207) -> Result<()> {
208    let names = secret_key_names();
209    encrypt_walk(value, store, names)
210}
211
212fn encrypt_walk(
213    value: &mut toml::Value,
214    store: &crate::secrets::SecretStore,
215    names: &std::collections::HashSet<&'static str>,
216) -> Result<()> {
217    match value {
218        toml::Value::Table(table) => {
219            for (key, child) in table.iter_mut() {
220                if names.contains(key.as_str()) {
221                    encrypt_in_place(child, store)
222                        .with_context(|| format!("encrypting secret at key `{key}`"))?;
223                } else {
224                    encrypt_walk(child, store, names)?;
225                }
226            }
227        }
228        toml::Value::Array(items) => {
229            for item in items.iter_mut() {
230                encrypt_walk(item, store, names)?;
231            }
232        }
233        _ => {}
234    }
235    Ok(())
236}
237
238/// Encrypt the value at this slot — a string, an array of strings, or
239/// a table containing strings — using the given store. Non-string leaves
240/// (numbers, bools) are left alone; the operator presumably annotated a
241/// non-secret field with a secret-shaped name and we don't second-guess.
242///
243/// When the slot is a Table (e.g. `headers = { Authorization = "Bearer
244/// ...", X-Tenant = "..." }`), every leaf in the subtree is encrypted —
245/// the parent key matched the secret allowlist, so every value below it
246/// inherits the secret marker. This is the contract for `HashMap<String,
247/// String>`-shaped `#[secret]` fields where individual keys are
248/// user-supplied and can't be checked against a static allowlist.
249fn encrypt_in_place(value: &mut toml::Value, store: &crate::secrets::SecretStore) -> Result<()> {
250    match value {
251        toml::Value::String(s)
252            if !crate::secrets::SecretStore::is_encrypted(s) && !s.is_empty() =>
253        {
254            let encrypted = store.encrypt(s).context("encrypt string")?;
255            *s = encrypted;
256        }
257        toml::Value::Array(items) => {
258            for item in items.iter_mut() {
259                encrypt_in_place(item, store)?;
260            }
261        }
262        toml::Value::Table(table) => {
263            for (_, child) in table.iter_mut() {
264                encrypt_in_place(child, store)?;
265            }
266        }
267        _ => {}
268    }
269    Ok(())
270}
271
272/// Versioned TOML → validated V3 `Config`, strict: any defect errors.
273/// Used by repair tooling (`zeroclaw config migrate`, `model_routing_config`)
274/// that needs the precise failure. Daemon load uses the resilient path.
275pub fn migrate_to_current(input: &str) -> Result<Config> {
276    let final_value = migrate_value(input)?;
277    final_value
278        .try_into()
279        .context("migrated config failed to deserialize as current schema")
280}
281
282/// Daemon load path: versioned TOML → usable `Config`, never failing.
283/// Thin wrapper over [`migrate_to_current_salvaged`] that drops the report.
284pub fn migrate_to_current_resilient(input: &str) -> Config {
285    migrate_to_current_salvaged(input).config
286}
287
/// Top-level keys whose silent loss could *weaken* security posture: dropping
/// a malformed one to its `Default` may grant a broader posture than intended.
/// Salvage still drops them (so the daemon boots) but logs ERROR and reports
/// them in [`ResilientLoad::dropped_security`] for exposure gating.
///
/// Dropped paths are compared against this list by exact string equality, so
/// only a drop of the whole top-level section (e.g. `security`) is classified
/// as security-critical; nested dotted paths never match.
pub const SECURITY_CRITICAL_KEYS: &[&str] = &["security", "risk_profiles", "peer_groups"];
293
/// Sentinel `dropped_security` entry used when the *whole* config is replaced
/// by `Config::default()` (unparseable TOML, unsupported future schema, broken
/// migration chain, or a root that cannot be salvaged section-by-section). In
/// that case every security-critical section is lost at once, so the posture is
/// degraded and the serving gate must refuse to start without an explicit
/// operator override — exactly as it does for a single dropped section.
///
/// The angle brackets keep the sentinel visually distinct from the dotted
/// config paths that otherwise populate `dropped_security`.
pub const WHOLE_CONFIG_SENTINEL: &str = "<entire-config>";
301
/// Result of a resilient (never-failing) config load.
///
/// Bundles the usable `Config` with a report of what had to be discarded to
/// obtain it. Both lists are empty when the input deserialized strictly.
#[derive(Debug, Clone, Default)]
pub struct ResilientLoad {
    /// Loaded config: every section that parsed, `Default` for any dropped.
    pub config: Config,
    /// Non-security paths dropped during salvage (logged WARN).
    pub dropped: Vec<String>,
    /// [`SECURITY_CRITICAL_KEYS`] sections dropped to `Default` (logged ERROR).
    /// Non-empty means the running posture may be weaker than intended.
    /// Holds [`WHOLE_CONFIG_SENTINEL`] when the entire config fell back to
    /// `Config::default()`.
    pub dropped_security: Vec<String>,
}
313
314/// Daemon load path with a salvage report. Degrades instead of failing:
315/// strict deserialize first; else drop each invalid channel alias, channel
316/// type, and top-level section (substituting `Default`); else fall back to
317/// `Config::default()`. Security-critical drops log ERROR and surface in
318/// `dropped_security`. `Config::validate()` is intentionally not run.
319pub fn migrate_to_current_salvaged(input: &str) -> ResilientLoad {
320    let value = match migrate_value(input) {
321        Ok(value) => value,
322        Err(err) => {
323            ::zeroclaw_log::record!(
324                ERROR,
325                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
326                    .with_outcome(::zeroclaw_log::EventOutcome::Failure)
327                    .with_attrs(::serde_json::json!({ "error": format!("{err:#}") })),
328                "config could not be parsed or migrated; starting on defaults so it \
329                 can be repaired (gateway /api/config, `zeroclaw config migrate`)"
330            );
331            return ResilientLoad {
332                config: Config::default(),
333                dropped: Vec::new(),
334                // Whole-config loss degrades the security posture: every
335                // security-critical section is gone, so mark it so the serving
336                // gate refuses to start without an explicit override.
337                dropped_security: vec![WHOLE_CONFIG_SENTINEL.to_string()],
338            };
339        }
340    };
341    deserialize_resilient(value)
342}
343
344/// Parse + migrate to the current schema version as a `toml::Value`, without
345/// the final typed deserialize. Shared by the strict and resilient entries.
346fn migrate_value(input: &str) -> Result<toml::Value> {
347    let value: toml::Value = toml::from_str(input).context("failed to parse config TOML")?;
348    let from = detect_version(&value)?;
349    if from == CURRENT_SCHEMA_VERSION {
350        Ok(value)
351    } else if from > CURRENT_SCHEMA_VERSION {
352        ::zeroclaw_log::record!(
353            ERROR,
354            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
355                .with_outcome(::zeroclaw_log::EventOutcome::Failure)
356                .with_attrs(::serde_json::json!({
357                    "from_version": from,
358                    "supported_version": CURRENT_SCHEMA_VERSION,
359                })),
360            "config schema_version is newer than this binary supports"
361        );
362        anyhow::bail!(
363            "config schema_version {from} is newer than this binary supports ({CURRENT_SCHEMA_VERSION})"
364        )
365    } else {
366        run_chain(value, from)
367    }
368}
369
370/// Deserialize a migrated `toml::Value` into `Config`, never failing.
371/// Strict first; on failure prune broken channel aliases, channel types, then
372/// top-level sections (each → `Default`), so only the broken blocks are lost.
373fn deserialize_resilient(value: toml::Value) -> ResilientLoad {
374    if let Ok(config) = value.clone().try_into::<Config>() {
375        return ResilientLoad {
376            config,
377            dropped: Vec::new(),
378            dropped_security: Vec::new(),
379        };
380    }
381
382    let mut salvaged = value;
383    let mut dropped: Vec<String> = Vec::new();
384    prune_bad_channel_aliases(&mut salvaged, &mut dropped);
385    prune_bad_channel_types(&mut salvaged, &mut dropped);
386    prune_bad_provider_aliases(&mut salvaged, &mut dropped);
387    prune_bad_top_level_sections(&mut salvaged, &mut dropped);
388
389    let mut whole_config_lost = false;
390    let config = salvaged.try_into::<Config>().unwrap_or_else(|err| {
391        // Nothing in the root table is individually salvageable (e.g. a
392        // non-table root). Boot on defaults so repair surfaces are reachable.
393        whole_config_lost = true;
394        ::zeroclaw_log::record!(
395            ERROR,
396            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
397                .with_outcome(::zeroclaw_log::EventOutcome::Failure)
398                .with_attrs(::serde_json::json!({ "error": format!("{err:#}") })),
399            "config could not be salvaged section-by-section; starting on defaults \
400             so it can be repaired"
401        );
402        Config::default()
403    });
404
405    let mut dropped_security: Vec<String> = Vec::new();
406    let mut dropped_plain: Vec<String> = Vec::new();
407    // A whole-config default loses every security-critical section at once, so
408    // mark it degraded even though no individual section was named in `dropped`.
409    if whole_config_lost {
410        dropped_security.push(WHOLE_CONFIG_SENTINEL.to_string());
411    }
412    for path in dropped {
413        if SECURITY_CRITICAL_KEYS.contains(&path.as_str()) {
414            dropped_security.push(path);
415        } else {
416            dropped_plain.push(path);
417        }
418    }
419
420    for path in &dropped_plain {
421        ::zeroclaw_log::record!(
422            WARN,
423            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
424                .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
425                .with_attrs(::serde_json::json!({ "dropped_config": path })),
426            &format!(
427                "config section `{path}` is invalid and was skipped so the daemon can \
428                 start; fix the block and reload to re-enable it"
429            )
430        );
431    }
432    for path in &dropped_security {
433        ::zeroclaw_log::record!(
434            ERROR,
435            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
436                .with_outcome(::zeroclaw_log::EventOutcome::Failure)
437                .with_attrs(::serde_json::json!({ "dropped_security_config": path })),
438            &format!(
439                "SECURITY-CRITICAL config section `{path}` is invalid and was reset to \
440                 its default so the daemon can boot; the running posture may be WEAKER \
441                 than intended — repair `{path}` and reload before trusting this instance. \
442                 Run `zeroclaw config migrate` to see the precise parse error, or fix it \
443                 via the gateway config editor at `/api/config`"
444            )
445        );
446    }
447
448    ResilientLoad {
449        config,
450        dropped: dropped_plain,
451        dropped_security,
452    }
453}
454
455/// Drop top-level `[section]`s that block deserialization (each → `Default`).
456/// Two probes: drop a single key if its removal validates the whole config;
457/// else drop every key that fails to deserialize in isolation (catches
458/// multiple independent offenders the joint probe can't). Appends to `dropped`.
459fn prune_bad_top_level_sections(value: &mut toml::Value, dropped: &mut Vec<String>) {
460    if value.as_table().is_none() {
461        return;
462    }
463    if value.clone().try_into::<Config>().is_ok() {
464        return;
465    }
466
467    let keys: Vec<String> = value
468        .as_table()
469        .expect("root is a table")
470        // toml::Value tables preserve insertion order, so drops are reported
471        // in TOML declaration order — predictable for operators reading logs.
472        .keys()
473        .cloned()
474        .collect();
475    for key in &keys {
476        let root = value.as_table_mut().expect("root is a table");
477        let Some(removed) = root.remove(key) else {
478            continue;
479        };
480        if value.clone().try_into::<Config>().is_ok() {
481            dropped.push(key.clone());
482            return;
483        }
484        value
485            .as_table_mut()
486            .expect("root is a table")
487            .insert(key.clone(), removed);
488    }
489
490    for key in keys {
491        let still_present = value.as_table().and_then(|root| root.get(&key)).cloned();
492        let Some(section) = still_present else {
493            continue;
494        };
495        if top_level_section_is_invalid(&key, &section) {
496            value.as_table_mut().expect("root is a table").remove(&key);
497            dropped.push(key);
498        }
499    }
500}
501
502/// True when top-level `[<key>]`, wrapped alone, fails to deserialize.
503fn top_level_section_is_invalid(key: &str, section: &toml::Value) -> bool {
504    let mut root = toml::value::Table::new();
505    root.insert(key.to_string(), section.clone());
506    toml::Value::Table(root).try_into::<Config>().is_err()
507}
508
509/// Drop each `[channels.<type>.<alias>]` that fails to deserialize, checked in
510/// isolation so valid siblings survive. Appends `channels.<type>.<alias>`.
511fn prune_bad_channel_aliases(value: &mut toml::Value, dropped: &mut Vec<String>) {
512    let Some(channels) = value
513        .as_table_mut()
514        .and_then(|root| root.get_mut("channels"))
515        .and_then(toml::Value::as_table_mut)
516    else {
517        return;
518    };
519
520    for (chan_type, aliases) in channels.iter_mut() {
521        let Some(alias_table) = aliases.as_table_mut() else {
522            continue;
523        };
524        let invalid: Vec<String> = alias_table
525            .iter()
526            .filter(|(_, v)| channel_alias_is_invalid(chan_type, v))
527            .map(|(k, _)| k.clone())
528            .collect();
529        for alias in invalid {
530            alias_table.remove(&alias);
531            dropped.push(format!("channels.{chan_type}.{alias}"));
532        }
533    }
534}
535
536/// Drop each `[providers.<kind>.<family>.<alias>]` that fails to deserialize,
537/// checked in isolation so valid siblings survive. Without this, one
538/// malformed provider alias makes `prune_bad_top_level_sections` drop the
539/// whole `providers` section: every model/tts/transcription provider
540/// vanishes on reload while agents.*.model_provider references dangle.
541/// Appends `providers.<kind>.<family>.<alias>`.
542fn prune_bad_provider_aliases(value: &mut toml::Value, dropped: &mut Vec<String>) {
543    let Some(provider_kinds) = value
544        .as_table_mut()
545        .and_then(|root| root.get_mut("providers"))
546        .and_then(toml::Value::as_table_mut)
547    else {
548        return;
549    };
550
551    // Non-table nodes where a kind/family map is required (e.g.
552    // `[providers.models] ollama = "oops"`) would otherwise still sink the
553    // whole section in prune_bad_top_level_sections. Drop just the node.
554    let scalar_kinds: Vec<String> = provider_kinds
555        .iter()
556        .filter(|(_, v)| !v.is_table())
557        .map(|(k, _)| k.clone())
558        .collect();
559    for kind in scalar_kinds {
560        provider_kinds.remove(&kind);
561        dropped.push(format!("providers.{kind}"));
562    }
563
564    for (kind, families) in provider_kinds.iter_mut() {
565        let family_table = families.as_table_mut().expect("scalar kinds pruned above");
566        let scalar_families: Vec<String> = family_table
567            .iter()
568            .filter(|(_, v)| !v.is_table())
569            .map(|(k, _)| k.clone())
570            .collect();
571        for family in scalar_families {
572            family_table.remove(&family);
573            dropped.push(format!("providers.{kind}.{family}"));
574        }
575        for (family, aliases) in family_table.iter_mut() {
576            let alias_table = aliases
577                .as_table_mut()
578                .expect("scalar families pruned above");
579            let invalid: Vec<String> = alias_table
580                .iter()
581                .filter(|(_, v)| provider_alias_is_invalid(kind, family, v))
582                .map(|(k, _)| k.clone())
583                .collect();
584            for alias in invalid {
585                alias_table.remove(&alias);
586                dropped.push(format!("providers.{kind}.{family}.{alias}"));
587            }
588        }
589    }
590}
591
592/// True when `[providers.<kind>.<family>.<alias>]`, wrapped alone, fails to
593/// deserialize. Unknown families pass (serde ignores them); only a
594/// known-family alias with bad field data is invalid.
595fn provider_alias_is_invalid(kind: &str, family: &str, alias_value: &toml::Value) -> bool {
596    let mut inner = toml::value::Table::new();
597    inner.insert("probe".to_string(), alias_value.clone());
598    let mut family_table = toml::value::Table::new();
599    family_table.insert(family.to_string(), toml::Value::Table(inner));
600    let mut kind_table = toml::value::Table::new();
601    kind_table.insert(kind.to_string(), toml::Value::Table(family_table));
602    let mut root = toml::value::Table::new();
603    root.insert("providers".to_string(), toml::Value::Table(kind_table));
604    toml::Value::Table(root).try_into::<Config>().is_err()
605}
606
607/// Drop each `[channels.<type>]` block still blocking the load after alias
608/// pruning (e.g. a scalar where a table is required). Drops only the offending
609/// type, never the whole `[channels]` section. Appends `channels.<type>`.
610fn prune_bad_channel_types(value: &mut toml::Value, dropped: &mut Vec<String>) {
611    let Some(channel_types) = value
612        .as_table()
613        .and_then(|root| root.get("channels"))
614        .and_then(toml::Value::as_table)
615        .map(|chans| chans.keys().cloned().collect::<Vec<_>>())
616    else {
617        return;
618    };
619
620    for chan_type in channel_types {
621        if channels_section_is_valid(value) {
622            return;
623        }
624        let Some(removed) = value
625            .as_table_mut()
626            .and_then(|root| root.get_mut("channels"))
627            .and_then(toml::Value::as_table_mut)
628            .and_then(|chans| chans.remove(&chan_type))
629        else {
630            continue;
631        };
632        if channels_section_is_valid(value) {
633            dropped.push(format!("channels.{chan_type}"));
634        } else {
635            value
636                .as_table_mut()
637                .and_then(|root| root.get_mut("channels"))
638                .and_then(toml::Value::as_table_mut)
639                .expect("channels is a table")
640                .insert(chan_type, removed);
641        }
642    }
643}
644
645/// True when `value`'s `[channels]` section deserializes cleanly in isolation.
646fn channels_section_is_valid(value: &toml::Value) -> bool {
647    let Some(channels) = value
648        .as_table()
649        .and_then(|root| root.get("channels"))
650        .cloned()
651    else {
652        return true;
653    };
654    let mut root = toml::value::Table::new();
655    root.insert("channels".to_string(), channels);
656    toml::Value::Table(root).try_into::<Config>().is_ok()
657}
658
659/// True when `[channels.<type>.<alias>]`, wrapped alone, fails to deserialize.
660fn channel_alias_is_invalid(chan_type: &str, alias_value: &toml::Value) -> bool {
661    let mut inner = toml::value::Table::new();
662    inner.insert("probe".to_string(), alias_value.clone());
663    let mut type_table = toml::value::Table::new();
664    type_table.insert(chan_type.to_string(), toml::Value::Table(inner));
665    let mut channels = toml::value::Table::new();
666    channels.insert("channels".to_string(), toml::Value::Table(type_table));
667    toml::Value::Table(channels).try_into::<Config>().is_err()
668}
669
670/// File-API wrapper: read disk config, migrate, write `<file>.backup`
671/// adjacent to the original, then atomically replace the original. Returns
672/// `Ok(None)` when already current.
673///
674/// Backup file is `<config_filename>.backup` (joined cross-platform via
675/// `Path` ops). The write path mirrors `Config::save()` so the documented
676/// durability guarantee holds end-to-end:
677///
678/// 1. Write the migrated content to `<path>.tmp-<uuid>` and fsync it.
679/// 2. Copy the original to `<path>.backup` (existing behavior; recovery
680///    rope if anything later goes wrong).
681/// 3. `rename(<path>.tmp, <path>)` — atomic on Unix and on modern Windows.
682/// 4. Fsync the parent directory so the rename is durable.
683///
684/// On rename failure the temp file is removed and the backup is restored
685/// over the original so the operator never observes a partial write.
686pub fn migrate_file_in_place(path: &Path) -> Result<Option<MigrateReport>> {
687    let raw = std::fs::read_to_string(path)
688        .with_context(|| format!("failed to read config at {}", path.display().to_string()))?;
689    let migrated = match migrate_file(&raw)? {
690        Some(s) => s,
691        None => return Ok(None),
692    };
693    let parent = path.parent().with_context(|| {
694        format!(
695            "config path {} has no parent directory",
696            path.display().to_string()
697        )
698    })?;
699    let file_name = path.file_name().and_then(|s| s.to_str()).with_context(|| {
700        format!(
701            "config path {} has no file name",
702            path.display().to_string()
703        )
704    })?;
705    let backup_path = parent.join(format!("{file_name}.backup"));
706    let temp_path = parent.join(format!(".{file_name}.tmp-{}", uuid::Uuid::new_v4()));
707
708    // 1. Write migrated content to temp + fsync.
709    {
710        let mut temp = std::fs::OpenOptions::new()
711            .create_new(true)
712            .write(true)
713            .open(&temp_path)
714            .with_context(|| {
715                format!(
716                    "failed to create temporary migrated config at {}",
717                    temp_path.display()
718                )
719            })?;
720        std::io::Write::write_all(&mut temp, migrated.as_bytes()).with_context(|| {
721            format!(
722                "failed to write migrated config to {}",
723                temp_path.display().to_string()
724            )
725        })?;
726        temp.sync_all().with_context(|| {
727            format!(
728                "failed to fsync temporary migrated config at {}",
729                temp_path.display()
730            )
731        })?;
732    }
733
734    // 2. Backup original BEFORE touching the destination. Copy gets a fresh inode.
735    std::fs::copy(path, &backup_path).with_context(|| {
736        format!(
737            "failed to write backup {} before migration (temp file intact at {})",
738            backup_path.display().to_string(),
739            temp_path.display().to_string(),
740        )
741    })?;
742
743    // 3. Atomic rename. On failure, restore from backup so the operator
744    //    never observes a partial write.
745    if let Err(rename_err) = std::fs::rename(&temp_path, path) {
746        let _ = std::fs::remove_file(&temp_path);
747        if backup_path.exists() {
748            let _ = std::fs::copy(&backup_path, path);
749        }
750        ::zeroclaw_log::record!(
751            ERROR,
752            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Fail)
753                .with_outcome(::zeroclaw_log::EventOutcome::Failure)
754                .with_attrs(::serde_json::json!({
755                    "path": path.display().to_string(),
756                    "backup_path": backup_path.display().to_string(),
757                    "error": format!("{}", rename_err),
758                })),
759            "atomic rename failed during config migration"
760        );
761        anyhow::bail!(
762            "failed to atomically replace {} with migrated config: {rename_err} \
763             (backup retained at {})",
764            path.display().to_string(),
765            backup_path.display().to_string(),
766        );
767    }
768
769    // 4. Fsync the parent directory so the rename is durable across crashes.
770    sync_directory(parent).with_context(|| {
771        format!(
772            "failed to fsync parent directory after migration: {}",
773            parent.display()
774        )
775    })?;
776
777    Ok(Some(MigrateReport {
778        backup_path,
779        to_version: CURRENT_SCHEMA_VERSION,
780    }))
781}
782
783/// Fsync the directory entry so a subsequent rename inside it is durable.
784/// No-op on platforms where directory fsync isn't a meaningful primitive.
785#[allow(clippy::unused_async)] // kept sync to mirror Config::save()'s helper
786fn sync_directory(path: &Path) -> Result<()> {
787    #[cfg(unix)]
788    {
789        let dir = std::fs::File::open(path).with_context(|| {
790            format!(
791                "failed to open directory for fsync: {}",
792                path.display().to_string()
793            )
794        })?;
795        dir.sync_all().with_context(|| {
796            format!("failed to fsync directory: {}", path.display().to_string())
797        })?;
798    }
799    #[cfg(not(unix))]
800    {
801        // Best-effort: open + drop. Windows doesn't provide a portable
802        // directory-fsync primitive in std; the rename itself is durable
803        // on NTFS.
804        let _ = std::fs::File::open(path);
805    }
806    Ok(())
807}
808
/// Summary of a migration that was actually written to disk.
///
/// `migrate_file_in_place` returns `Ok(Some(MigrateReport))` when it rewrote
/// the file and `Ok(None)` when the input was already at the current schema.
#[derive(Debug, Clone)]
pub struct MigrateReport {
    /// Where the pre-migration copy of the config was saved.
    pub backup_path: std::path::PathBuf,
    /// Schema version the file was migrated to.
    pub to_version: u32,
}
816
817/// Refuse to proceed if the on-disk config is at a stale schema version.
818///
819/// Used by CLI write commands (`config set`, `config patch`, `config init`)
820/// to ensure the user explicitly opts into the migration via
821/// `zeroclaw config migrate` before modifying a stale config — the alternative
822/// would be a silent auto-migrate-on-write, which is harder to audit and
823/// surprises users who didn't realize their config schema had changed.
824///
825/// - Missing file → `Ok(())` (fresh install: nothing to migrate yet).
826/// - Current version → `Ok(())`.
827/// - Stale (or future) version → `Err` with a message that names the disk
828///   version and the command the user needs to run.
829pub fn ensure_disk_at_current_version(path: &Path) -> Result<()> {
830    let raw = match std::fs::read_to_string(path) {
831        Ok(s) => s,
832        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()),
833        Err(e) => {
834            return Err(anyhow::Error::from(e)).with_context(|| {
835                format!("failed to read config at {}", path.display().to_string())
836            });
837        }
838    };
839    let value: toml::Value =
840        toml::from_str(&raw).context("failed to parse config TOML for version check")?;
841    let from = detect_version(&value)?;
842    if from == CURRENT_SCHEMA_VERSION {
843        return Ok(());
844    }
845    if from > CURRENT_SCHEMA_VERSION {
846        anyhow::bail!(
847            "config at {} is schema_version {from}, newer than this binary supports ({})",
848            path.display().to_string(),
849            CURRENT_SCHEMA_VERSION,
850        );
851    }
852    anyhow::bail!(
853        "config at {} is schema_version {from}; run `zeroclaw config migrate` to update before modifying",
854        path.display().to_string(),
855    );
856}
857
858/// Fold a `from_key: String` value into a `to_key: Vec<String>` array on the
859/// same table. Used for the singular→plural channel transforms (V1→V2:
860/// `matrix.room_id` → `allowed_rooms`, `slack.channel_id` → `channel_ids`;
861/// V2→V3: `discord.guild_id` → `guild_ids`, etc.).
862///
863/// - Removes `from_key` from the table.
864/// - If the value was a non-empty string, appends it to `to_key`'s array
865///   (creating the array if missing). Existing entries are preserved; the new
866///   value is deduplicated against current contents.
867/// - Empty strings, non-string types, and missing `from_key` are no-ops.
868///
869/// Returns `true` if a value was actually folded (caller may emit a log line).
870pub(crate) fn fold_string_into_array(
871    table: &mut toml::Table,
872    from_key: &str,
873    to_key: &str,
874) -> bool {
875    let value = match table.remove(from_key) {
876        Some(toml::Value::String(s)) if !s.is_empty() => s,
877        Some(other) => {
878            // Non-string: re-insert under from_key untouched (caller may handle).
879            table.insert(from_key.to_string(), other);
880            return false;
881        }
882        None => return false,
883    };
884    let entry = table
885        .entry(to_key.to_string())
886        .or_insert_with(|| toml::Value::Array(Vec::new()));
887    if let Some(arr) = entry.as_array_mut() {
888        let already_present = arr.iter().any(|v| v.as_str() == Some(value.as_str()));
889        if !already_present {
890            arr.push(toml::Value::String(value));
891        }
892        true
893    } else {
894        // Existing to_key wasn't an array (unusual). Reinsert from_key as-is.
895        table.insert(from_key.to_string(), toml::Value::String(value));
896        false
897    }
898}
899
900/// One typed migration step: `V_n` TOML → `V_{n+1}` TOML.
901type MigrationStep = fn(toml::Value) -> Result<toml::Value>;
902
903/// Migration steps keyed 1-indexed by `from` version: `MIGRATION_STEPS[n]`
904/// is the step from `V_n` to `V_{n+1}`. Slot 0 is a never-invoked
905/// placeholder so callers can write `&MIGRATION_STEPS[from..target]`
906/// directly — both bounds read as schema-version numbers, no offset math.
907///
908/// To add a new schema version `V_n`:
909/// 1. Add `schema/v{n-1}.rs` with a partial typed lens for the prior shape.
910/// 2. Implement `V{n-1}Config::migrate(self) -> Result<toml::Value>`.
911/// 3. Bump [`CURRENT_SCHEMA_VERSION`] to `n`.
912/// 4. Append a new closure here that deserializes `V{n-1}Config` and calls
913///    its `migrate()`. The compile-time assertion below catches drift.
914const MIGRATION_STEPS: &[MigrationStep] = &[
915    // V0 → V1: padding so slot 0 is never indexed. V0 does not exist.
916    |_| unreachable!("MIGRATION_STEPS[0] is a 1-indexing pad and is never invoked"),
917    // V1 → V2
918    |value| {
919        let v1: V1Config = value
920            .try_into()
921            .context("failed to deserialize input as V1 schema")?;
922        let v2 = v1.migrate();
923        toml::Value::try_from(v2).context("failed to serialize V2 intermediate")
924    },
925    // V2 → V3
926    |value| {
927        let v2: V2Config = value
928            .try_into()
929            .context("failed to deserialize as V2 schema")?;
930        v2.migrate().context("failed to migrate V2 → V3")
931    },
932];
933
934const _: () = assert!(
935    MIGRATION_STEPS.len() as u32 == CURRENT_SCHEMA_VERSION,
936    "MIGRATION_STEPS must have exactly one entry per schema version \
937     (length = CURRENT_SCHEMA_VERSION, including the slot-0 padding)",
938);
939
940/// Run the typed migration chain from `from` up to `CURRENT_SCHEMA_VERSION`.
941/// `from` must be `< CURRENT_SCHEMA_VERSION` (caller checks).
942fn run_chain(value: toml::Value, from: u32) -> Result<toml::Value> {
943    run_chain_until(value, from, CURRENT_SCHEMA_VERSION)
944}
945
946/// Run the typed migration chain from `from` up to `target` (the shape that
947/// is emitted). `target` must be in `from..=CURRENT_SCHEMA_VERSION`.
948///
949/// Used by `migrate_file` / `migrate_to_current` (target = current) and by
950/// [`generate`] (target = any historical version, for fixture generation).
951fn run_chain_until(value: toml::Value, from: u32, target: u32) -> Result<toml::Value> {
952    if target < from {
953        anyhow::bail!("cannot migrate backwards from V{from} to V{target}");
954    }
955    if target > CURRENT_SCHEMA_VERSION {
956        anyhow::bail!(
957            "target V{target} exceeds CURRENT_SCHEMA_VERSION (V{CURRENT_SCHEMA_VERSION})"
958        );
959    }
960
961    let mut cur = value;
962    for step in &MIGRATION_STEPS[from as usize..target as usize] {
963        cur = step(cur)?;
964    }
965    Ok(cur)
966}
967
968/// Reconcile new typed values into an existing `toml_edit::DocumentMut` so
969/// comments and decoration on surviving keys are preserved across save.
970///
971/// Walks `new` recursively. For each key:
972/// - If the key exists in `doc` AND both sides are tables, recurse.
973/// - If the key exists in `doc` and at least one side is not a table, replace
974///   the value while preserving the key's prefix decor (i.e. the comment lines
975///   that lead the key).
976/// - If the key does not exist in `doc`, insert it.
977///
978/// Removed keys (present in `doc` but absent from `new`) are dropped from `doc`.
979/// This matches the prior crate behavior: the typed schema is authoritative,
980/// and any TOML key not represented in `new` is not part of the current schema.
981pub(crate) fn sync_table(doc: &mut toml_edit::Table, new: &toml::Table) {
982    // Drop keys not present in new
983    let to_remove: Vec<String> = doc
984        .iter()
985        .map(|(k, _)| k.to_string())
986        .filter(|k| !new.contains_key(k))
987        .collect();
988    for k in to_remove {
989        doc.remove(&k);
990    }
991
992    for (key, new_value) in new.iter() {
993        if let (Some(doc_item), toml::Value::Table(new_sub)) =
994            (doc.get_mut(key.as_str()), new_value)
995            && let Some(doc_sub) = doc_item.as_table_mut()
996        {
997            // Both tables — recurse to preserve nested comments.
998            sync_table(doc_sub, new_sub);
999            continue;
1000        }
1001        // Otherwise, replace the value while preserving the key's leading decor.
1002        let new_item = toml_value_to_edit_item(new_value);
1003        match doc.get_mut(key.as_str()) {
1004            Some(existing) => {
1005                // Preserve the key's leading decor (comments) by mutating in place.
1006                *existing = new_item;
1007            }
1008            None => {
1009                doc.insert(key.as_str(), new_item);
1010            }
1011        }
1012    }
1013}
1014
1015/// Convert a `toml::Value` into a `toml_edit::Item` for insertion into
1016/// a `DocumentMut`. Tables become inline tables when small, real tables
1017/// otherwise — matches `toml_edit`'s default round-trip behavior.
1018pub(crate) fn toml_value_to_edit_item(value: &toml::Value) -> toml_edit::Item {
1019    // Easiest path: serialize to string, parse as toml_edit. Lossy on numeric
1020    // formatting nuance but correct for migration round-trip where we're
1021    // emitting freshly-serialized values.
1022    let serialized = match value {
1023        toml::Value::Table(t) => {
1024            let mut wrapper = toml::Table::new();
1025            wrapper.insert("__v".into(), toml::Value::Table(t.clone()));
1026            toml::to_string(&wrapper).unwrap_or_default()
1027        }
1028        other => {
1029            let mut wrapper = toml::Table::new();
1030            wrapper.insert("__v".into(), other.clone());
1031            toml::to_string(&wrapper).unwrap_or_default()
1032        }
1033    };
1034    let doc: toml_edit::DocumentMut = serialized.parse().unwrap_or_default();
1035    doc.get("__v").cloned().unwrap_or(toml_edit::Item::None)
1036}
1037
#[cfg(test)]
mod tests {
    use super::*;

    // ── detect_version ──

    #[test]
    fn detect_version_missing_is_v1() {
        // No `schema_version` key at all means a pre-versioned (V1) config.
        let parsed = toml::from_str::<toml::Value>("foo = 1").unwrap();
        assert_eq!(detect_version(&parsed).unwrap(), 1);
    }

    #[test]
    fn detect_version_explicit() {
        let parsed = toml::from_str::<toml::Value>("schema_version = 2\n").unwrap();
        assert_eq!(detect_version(&parsed).unwrap(), 2);
    }

    #[test]
    fn detect_version_negative_errors() {
        let parsed = toml::from_str::<toml::Value>("schema_version = -1\n").unwrap();
        assert!(detect_version(&parsed).is_err());
    }

    #[test]
    fn detect_version_string_errors() {
        let parsed = toml::from_str::<toml::Value>("schema_version = \"two\"\n").unwrap();
        assert!(detect_version(&parsed).is_err());
    }

    // ── resilient daemon load: the daemon must start even on a damaged
    //    config, so the operator can reach a surface to repair it ──

    #[test]
    fn broken_channel_alias_is_dropped_not_fatal() {
        // The email alias lacks the required `imap_host`; the load must
        // continue without it rather than abort.
        let input = r#"
schema_version = 3

[channels.email.fakeemail]
enabled = true
smtp_host = "smtp.example.com"
username = "u"
password = "p"
from_address = "a@example.com"
"#;
        let config = migrate_to_current_resilient(input);
        assert!(
            !config.channels.email.contains_key("fakeemail"),
            "invalid alias must be pruned"
        );
    }

    #[test]
    fn valid_provider_aliases_survive_broken_sibling() {
        // Regression for the zerocode "all providers vanish after restart"
        // report: a single malformed alias must not drag the rest of
        // [providers] down with it.
        let input = r#"
schema_version = 3

[providers.models.ollama.ai]
model = "qwen3:30b"

[providers.models.custom.rag_bot]
uri = "http://localhost:8000/v1"
model = "m"

[providers.models.custom.broken]
uri = "http://localhost:9000/v1"
model = "m"
temperature = "hot"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert_eq!(outcome.dropped, vec!["providers.models.custom.broken"]);

        let models = &outcome.config.providers.models;
        assert!(
            models.find("ollama", "ai").is_some(),
            "valid alias in another family must survive"
        );
        assert!(
            models.find("custom", "rag_bot").is_some(),
            "valid sibling alias must survive"
        );
        assert!(
            models.find("custom", "broken").is_none(),
            "only the malformed alias is pruned"
        );
    }

    #[test]
    fn provider_pruner_never_panics_on_non_table_shapes() {
        // Shapes covered: scalar [providers], array-of-tables where a family
        // map is expected, an array as alias value, an array as family value.
        // The salvage path is the daemon's never-fail loader, and
        // prune_bad_provider_aliases carries expect() calls that rely on the
        // scalar pre-passes; this test pins that invariant.
        for malformed in [
            "schema_version = 3\nproviders = 3\n",
            "schema_version = 3\n[[providers.models.ollama]]\nmodel = \"x\"\n",
            "schema_version = 3\n[providers.models.ollama]\nai = [1, 2]\n",
            "schema_version = 3\n[providers.models]\nollama = [1]\n",
        ] {
            let _ = migrate_to_current_salvaged(malformed);
        }
    }

    #[test]
    fn scalar_provider_nodes_pruned_without_sinking_section() {
        // A scalar sitting where a family/kind table belongs should cost only
        // that node, never the whole [providers] section.
        let input = r#"
schema_version = 3

[providers.models]
ollama = "oops"

[providers.models.custom.rag_bot]
uri = "http://localhost:8000/v1"
model = "m"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert_eq!(outcome.dropped, vec!["providers.models.ollama"]);
        assert!(
            outcome
                .config
                .providers
                .models
                .find("custom", "rag_bot")
                .is_some(),
            "valid alias must survive a scalar sibling family"
        );
    }

    #[test]
    fn valid_alias_survives_broken_sibling() {
        // `broken` lacks `imap_host`; `good` is complete.
        let input = r#"
schema_version = 3

[channels.email.broken]
enabled = true
smtp_host = "smtp.example.com"
username = "u"
password = "p"
from_address = "a@example.com"

[channels.email.good]
enabled = true
imap_host = "imap.example.com"
smtp_host = "smtp.example.com"
username = "u"
password = "p"
from_address = "a@example.com"
"#;
        let config = migrate_to_current_resilient(input);
        let email = &config.channels.email;
        assert!(email.contains_key("good"), "valid sibling must be kept");
        assert!(
            !email.contains_key("broken"),
            "invalid sibling must be pruned"
        );
    }

    #[test]
    fn broken_non_channel_section_falls_back_to_default() {
        // A type mismatch outside the channel maps must not stop the daemon;
        // the offending section reverts to its default for later repair.
        let input = r#"
schema_version = 3

[heartbeat]
enabled = "not-a-bool"
"#;
        let config = migrate_to_current_resilient(input);
        // Reaching this point means the load did not panic; `[heartbeat]`
        // now carries its serde defaults.
        assert!(!config.heartbeat.enabled);
        assert_eq!(config.heartbeat.interval_minutes, 30);
    }

    #[test]
    fn unparseable_config_falls_back_to_defaults() {
        // The input is not TOML at all. The daemon still boots on defaults so
        // the operator can reach a repair surface and overwrite the file.
        let config = migrate_to_current_resilient("this is not valid TOML {{{");
        assert_eq!(config.schema_version, Config::default().schema_version);
    }

    #[test]
    fn future_schema_version_falls_back_to_defaults() {
        // A schema newer than this binary cannot be migrated, yet the daemon
        // has to start rather than refuse to boot.
        let input = format!("schema_version = {}\n", CURRENT_SCHEMA_VERSION + 100);
        let config = migrate_to_current_resilient(&input);
        assert_eq!(config.schema_version, Config::default().schema_version);
    }

    #[test]
    fn unparseable_config_marks_whole_config_degraded() {
        // Losing the whole config loses every security-critical section at
        // once. It must be flagged as degraded, otherwise the serving gate has
        // no signal and silently boots a defaulted security posture.
        let outcome = migrate_to_current_salvaged("this is not valid TOML {{{");
        assert!(
            outcome
                .dropped_security
                .iter()
                .any(|path| path == WHOLE_CONFIG_SENTINEL),
            "unparseable config must degrade security posture, got {:?}",
            outcome.dropped_security
        );
    }

    #[test]
    fn future_schema_version_marks_whole_config_degraded() {
        let input = format!("schema_version = {}\n", CURRENT_SCHEMA_VERSION + 100);
        let outcome = migrate_to_current_salvaged(&input);
        assert!(
            outcome
                .dropped_security
                .iter()
                .any(|path| path == WHOLE_CONFIG_SENTINEL),
            "unsupported future schema must degrade security posture, got {:?}",
            outcome.dropped_security
        );
    }

    #[test]
    fn unsalvageable_root_marks_whole_config_degraded() {
        // Otherwise-plausible keys followed by a dangling `[`: there is no
        // usable root to salvage section-by-section. Whichever fallback ends
        // up defaulting the whole config must report a security drop.
        let input = "schema_version = 3\nthis_is_a_bare_top_level = \"value\"\n[\n";
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            !outcome.dropped_security.is_empty(),
            "an unsalvageable root must degrade security posture, got {:?}",
            outcome.dropped_security
        );
    }

    #[test]
    fn strict_path_still_errors_for_tooling() {
        // `migrate_to_current` is the strict entry point: repair tooling
        // depends on seeing the error instead of a silently pruned config.
        let input = r#"
schema_version = 3

[channels.email.fakeemail]
enabled = true
smtp_host = "smtp.example.com"
username = "u"
password = "p"
from_address = "a@example.com"
"#;
        assert!(
            migrate_to_current(input).is_err(),
            "strict path must surface the defect for repair tooling"
        );
    }

    #[test]
    fn broken_security_section_is_reported_as_degraded() {
        let input = r#"
schema_version = 3

[security]
audit = "should-be-a-table-not-a-string"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            outcome
                .dropped_security
                .iter()
                .any(|path| path == "security"),
            "malformed [security] must be reported as a security-critical drop"
        );
        assert!(
            outcome.dropped.is_empty(),
            "security drop must not also appear in the plain dropped list"
        );
    }

    #[test]
    fn broken_non_security_section_is_plain_drop_not_security() {
        let input = r#"
schema_version = 3

[heartbeat]
enabled = "not-a-bool"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            outcome.dropped.iter().any(|path| path == "heartbeat"),
            "malformed [heartbeat] must be a plain drop"
        );
        assert!(
            outcome.dropped_security.is_empty(),
            "a non-security section must never be flagged security-critical"
        );
    }

    #[test]
    fn broken_channel_type_block_is_dropped_not_fatal() {
        // `email` is a scalar where a table of aliases belongs; the telegram
        // block next to it is valid.
        let input = r#"
schema_version = 3

[channels]
email = "oops-this-should-be-a-table"

[channels.telegram.main]
enabled = true
bot_token = "t"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            outcome.dropped.iter().any(|path| path == "channels.email"),
            "the broken whole-type block must be dropped, got {:?}",
            outcome.dropped
        );
        assert!(
            outcome.config.channels.telegram.contains_key("main"),
            "valid sibling channel type must survive a broken-type drop"
        );
    }

    #[test]
    fn multiple_independent_bad_sections_all_dropped() {
        let input = r#"
schema_version = 3

[heartbeat]
enabled = "not-a-bool"

[backup]
enabled = "also-not-a-bool"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            outcome.dropped.iter().any(|path| path == "heartbeat"),
            "first offender must be dropped, got {:?}",
            outcome.dropped
        );
        assert!(
            outcome.dropped.iter().any(|path| path == "backup"),
            "second offender must be dropped, got {:?}",
            outcome.dropped
        );
    }

    #[test]
    fn multiple_bad_sections_one_security_critical() {
        let input = r#"
schema_version = 3

[security]
audit = "should-be-a-table-not-a-string"

[heartbeat]
enabled = "not-a-bool"
"#;
        let outcome = migrate_to_current_salvaged(input);
        assert!(
            outcome
                .dropped_security
                .iter()
                .any(|path| path == "security"),
            "malformed [security] must be classified security-critical, got {:?}",
            outcome.dropped_security
        );
        assert!(
            outcome.dropped.iter().any(|path| path == "heartbeat"),
            "malformed [heartbeat] must be a plain drop, got {:?}",
            outcome.dropped
        );
        assert!(
            !outcome.dropped.iter().any(|path| path == "security"),
            "security drop must not also appear in the plain dropped list"
        );
    }

    // ── migrate_file_in_place atomic-write semantics ──

    /// Fresh scratch directory for a test; removed when the guard is dropped.
    fn setup_temp_config_dir() -> tempfile::TempDir {
        tempfile::TempDir::new().expect("temp dir")
    }

    #[test]
    fn migrate_file_in_place_writes_backup_and_replaces_atomically() {
        let scratch = setup_temp_config_dir();
        let config_path = scratch.path().join("config.toml");
        // Smallest possible V1 input: no `schema_version`, so migration runs.
        std::fs::write(
            &config_path,
            "default_model_provider = \"openai\"\nfoo = 1\n",
        )
        .unwrap();

        let report = migrate_file_in_place(&config_path)
            .expect("migration succeeds")
            .expect("migration ran (V1 input)");

        // The backup holds the original, pre-migration content.
        let backup = std::fs::read_to_string(&report.backup_path).unwrap();
        assert!(
            backup.contains("default_model_provider = \"openai\"") && backup.contains("foo = 1"),
            "backup must contain the original V1 content; got: {backup}"
        );

        // The config file itself now holds the migrated content.
        let migrated = std::fs::read_to_string(&config_path).unwrap();
        assert!(
            migrated.contains("schema_version"),
            "migrated config must carry a schema_version line; got: {migrated}"
        );

        // The temp file used for the atomic swap must be gone.
        let leftovers: Vec<_> = std::fs::read_dir(scratch.path())
            .unwrap()
            .flatten()
            .filter(|entry| {
                entry
                    .file_name()
                    .to_string_lossy()
                    .starts_with(".config.toml.tmp-")
            })
            .collect();
        assert!(
            leftovers.is_empty(),
            "no temp files must remain after a successful migration; got {leftovers:?}"
        );
    }

    #[test]
    fn migrate_file_in_place_noop_when_already_current() {
        let scratch = setup_temp_config_dir();
        let config_path = scratch.path().join("config.toml");
        std::fs::write(
            &config_path,
            format!("schema_version = {CURRENT_SCHEMA_VERSION}\n"),
        )
        .unwrap();

        let report = migrate_file_in_place(&config_path).expect("idempotent on current schema");
        assert!(
            report.is_none(),
            "no migration should run when the file is already at CURRENT_SCHEMA_VERSION"
        );

        // A skipped migration must not leave a backup behind either.
        let backup = config_path.with_file_name("config.toml.backup");
        assert!(
            !backup.exists(),
            "no `.backup` should be created on the no-op path; got {}",
            backup.display()
        );
    }
}