Skip to main content

doiget_core/
lib.rs

1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod dry_run;
20pub mod http;
21pub mod orchestrator;
22pub mod provenance;
23pub mod rate_limiter;
24pub mod refs;
25pub mod source;
26pub mod sources;
27pub mod store;
28pub mod user_extension;
29
30// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
31// Cargo feature, which itself enables the `metadata` feature so the
32// Tier-2 source impls are available.
33#[cfg(feature = "citation")]
34pub mod citation_graph;
35
36// Re-export the canonical-tuple audit-identity types at the crate root
37// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
38// the [`canonical`] submodule.
39pub use crate::canonical::{CanonicalRef, SourceType};
40
/// Crate version, taken from `CARGO_PKG_VERSION` at compile time. Used by
/// `doiget-cli --version` and `doiget_health`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// TOML schema version this build writes. See `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded cap on the number of concurrent fetches. See `docs/LEGAL.md`
/// §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded cap on fetches per second. See `docs/LEGAL.md` §6
/// safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Maximum batch size for `doiget batch` and `doiget_batch_fetch`.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Slice 2 alias for [`MCP_BATCH_MAX_SIZE`] using the
/// spec-language name (`docs/MCP_TOOLS.md` §1 / Slice 2 plan). The
/// numeric value MUST equal [`MCP_BATCH_MAX_SIZE`]; it is defined in terms
/// of that constant, and an internal test pins the equivalence so the two
/// constants cannot drift.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// Maximum queued MCP requests beyond `MAX_CONCURRENT_FETCHES`. Excess returns
/// `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 / `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// MCP server stdin-EOF graceful-shutdown deadline, in seconds. See ADR-0001
/// and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Maximum DOI suffix length accepted at validation, in bytes. See
/// `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Maximum PDF body size accepted by the fetcher, in bytes (100 million
/// bytes). See `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Time-to-live, in days, for entries in `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Time-to-live, in days, for entries in `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
84
85// ---------------------------------------------------------------------------
86// Ref
87// ---------------------------------------------------------------------------
88
/// A reference to a paper, either by DOI or arXiv id.
///
/// See `docs/SECURITY.md` §1.1 for input-validation rules.
///
/// Wire format: adjacently tagged. The lowercased variant name is stored
/// under `kind` and the bare identifier under `id`, e.g.
/// `{"kind":"doi","id":"10.1234/example"}` or
/// `{"kind":"arxiv","id":"2401.12345"}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
    /// A DOI (e.g., `10.1234/example`).
    Doi(Doi),
    /// An arXiv id (e.g., `2401.12345`).
    Arxiv(ArxivId),
}
100
/// A validated DOI string.
///
/// Construct via [`Doi::parse`]. The inner field is intentionally
/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
/// still use `Doi(s)` for fixture purposes.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
///
/// NOTE(review): the derived `Deserialize` combined with
/// `#[serde(transparent)]` builds a `Doi` from any string without running
/// [`Doi::parse`], so deserialization is a second construction path that
/// skips validation. Confirm that every deserialization site is trusted or
/// re-validates, or switch to a validating `try_from` conversion.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Doi(pub(crate) String);
111
/// A validated arXiv id string.
///
/// Construct via [`ArxivId::parse`]. Inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
///
/// NOTE(review): the derived `Deserialize` combined with
/// `#[serde(transparent)]` builds an `ArxivId` from any string without
/// running [`ArxivId::parse`]. Confirm that every deserialization site is
/// trusted or re-validates.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ArxivId(pub(crate) String);
120
121impl Doi {
122    /// Returns the DOI as a string slice.
123    pub fn as_str(&self) -> &str {
124        &self.0
125    }
126
127    /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
128    ///
129    /// Accepts:
130    /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
131    ///   digits and `<suffix>` is a non-empty sequence of characters drawn
132    ///   from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
133    ///   `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
134    /// - The `doi:` URI scheme prefix; it is stripped before validation, so
135    ///   the stored value never carries a scheme. (Matches the convention
136    ///   established in `docs/SAFEKEY.md` §3 step 0.)
137    ///
138    /// Rejects:
139    /// - Inputs missing the literal `10.` prefix (after optional scheme
140    ///   strip).
141    /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
142    /// - Empty suffixes.
143    /// - Any character outside the suffix charset above (including control
144    ///   characters, whitespace, and non-ASCII).
145    ///
146    /// # Errors
147    ///
148    /// Returns a [`RefParseError`] variant that names the specific rejection
149    /// category. Tier 1+ callers should map any [`RefParseError`] to
150    /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
151    pub fn parse(s: &str) -> Result<Self, RefParseError> {
152        let stripped = parse::strip_doi_scheme(s);
153        parse::validate_doi(stripped)?;
154        Ok(Doi(stripped.to_string()))
155    }
156}
157
158impl ArxivId {
159    /// Returns the arXiv id as a string slice.
160    pub fn as_str(&self) -> &str {
161        &self.0
162    }
163
164    /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
165    /// pattern published in `docs/MCP_TOOLS.md`.
166    ///
167    /// Accepts:
168    /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
169    ///   sequence number is 4–5 digits, and the optional version `vN` is one
170    ///   or more digits. Examples: `2401.12345`, `2401.12345v2`.
171    /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
172    ///   is a lowercase token (with optional internal hyphens and an
173    ///   optional `.XX` two-uppercase-letter group), and the numeric body
174    ///   is exactly 7 digits with optional `vN`. Examples:
175    ///   `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
176    /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
177    ///   validation.
178    ///
179    /// Rejects:
180    /// - Inputs that match neither the new-style nor old-style shape.
181    /// - Inputs containing characters outside the per-shape charset
182    ///   (control chars, whitespace, non-ASCII).
183    /// - Empty input.
184    ///
185    /// # Errors
186    ///
187    /// Returns a [`RefParseError`] variant that names the specific rejection
188    /// category.
189    pub fn parse(s: &str) -> Result<Self, RefParseError> {
190        let stripped = parse::strip_arxiv_scheme(s);
191        parse::validate_arxiv(stripped)?;
192        Ok(ArxivId(stripped.to_string()))
193    }
194}
195
196impl Ref {
197    /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
198    ///
199    /// Detection rules:
200    /// 1. If the input begins with the case-insensitive `doi:` scheme, the
201    ///    remainder is parsed as a DOI.
202    /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
203    ///    remainder is parsed as an arXiv id.
204    /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
205    ///    DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
206    ///    reference) and is stable because DOIs always begin `10.`.
207    /// 4. Failing all of the above, parsing falls back to arXiv.
208    ///
209    /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
210    /// inner `Doi` / `ArxivId` is always the bare identifier.
211    ///
212    /// # Errors
213    ///
214    /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
215    /// [`ArxivId::parse`] call. When the input has an explicit scheme
216    /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
217    /// surfaces directly. When the input is bare and ambiguous, the
218    /// heuristic in rule 3/4 selects the parser; an unparsable bare input
219    /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
220    /// arXiv validation is never a valid DOI).
221    pub fn parse(s: &str) -> Result<Self, RefParseError> {
222        // Reject empty up front so all three parsers see a meaningful slice;
223        // without this, `strip_*_scheme("")` returns "" and we'd get a
224        // confusing "missing 10. prefix" error for empty input.
225        if s.is_empty() {
226            return Err(RefParseError::Empty);
227        }
228
229        if parse::has_doi_scheme(s) {
230            return Doi::parse(s).map(Ref::Doi);
231        }
232        if parse::has_arxiv_scheme(s) {
233            return ArxivId::parse(s).map(Ref::Arxiv);
234        }
235        if s.starts_with("10.") {
236            return Doi::parse(s).map(Ref::Doi);
237        }
238        ArxivId::parse(s).map(Ref::Arxiv)
239    }
240}
241
242// ---------------------------------------------------------------------------
243// Parser internals
244// ---------------------------------------------------------------------------
245
246mod parse {
247    use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
248
249    /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
250    /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
251    /// names the lowercase form, but the field convention is to be lenient
252    /// in what we accept (the scheme is dropped at the boundary anyway).
253    pub(crate) fn has_doi_scheme(s: &str) -> bool {
254        s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
255    }
256
257    /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
258    /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
259    /// mix.
260    pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
261        s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
262    }
263
264    pub(crate) fn strip_doi_scheme(s: &str) -> &str {
265        if has_doi_scheme(s) {
266            &s[4..]
267        } else {
268            s
269        }
270    }
271
272    pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
273        if has_arxiv_scheme(s) {
274            &s[6..]
275        } else {
276            s
277        }
278    }
279
280    /// DOI suffix charset per `docs/SECURITY.md` §1.1:
281    /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
282    /// suffix (e.g. `10.1016/...`); the registrant separator is the
283    /// *first* `/` and the suffix is everything after it.
284    ///
285    /// `:` is permitted because two large real publisher DOI families use
286    /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
287    /// and EDP Sciences / Journal de Physique
288    /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
289    /// capability: traversal requires composing `/` and `.` into `../`,
290    /// and both characters are already in the suffix charset. In addition,
291    /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
292    /// before any filesystem use, so `:` never reaches a path literally.
293    /// See ADR-0026 and `docs/SECURITY.md` §1.1.
294    fn is_doi_suffix_char(c: char) -> bool {
295        matches!(c,
296            'A'..='Z' | 'a'..='z' | '0'..='9'
297            | '.' | '_' | '/' | '(' | ')' | '-' | ':'
298        )
299    }
300
301    pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
302        if s.is_empty() {
303            return Err(RefParseError::Empty);
304        }
305
306        // Must begin with literal "10."; the registrant is 4–9 digits up
307        // to the first '/'. After that, everything is suffix.
308        let rest = s
309            .strip_prefix("10.")
310            .ok_or(RefParseError::MissingDoiPrefix)?;
311        let slash_idx = rest
312            .find('/')
313            .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
314        let registrant = &rest[..slash_idx];
315        let suffix = &rest[slash_idx + 1..];
316
317        // Registrant: 4–9 ASCII digits.
318        if registrant.len() < 4
319            || registrant.len() > 9
320            || !registrant.chars().all(|c| c.is_ascii_digit())
321        {
322            return Err(RefParseError::InvalidDoiRegistrant);
323        }
324
325        // Suffix: non-empty, charset-restricted, length-bounded.
326        if suffix.is_empty() {
327            return Err(RefParseError::EmptyDoiSuffix);
328        }
329        if suffix.len() > DOI_SUFFIX_MAX_LEN {
330            return Err(RefParseError::DoiSuffixTooLong {
331                len: suffix.len(),
332                max: DOI_SUFFIX_MAX_LEN,
333            });
334        }
335        if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
336            return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
337        }
338        Ok(())
339    }
340
341    /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
342    /// stripped). Tries the new-style shape first, then the old-style.
343    pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
344        if s.is_empty() {
345            return Err(RefParseError::Empty);
346        }
347        if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
348            return Ok(());
349        }
350        Err(RefParseError::InvalidArxivShape)
351    }
352
353    /// New-style arXiv id: `YYMM.NNNNN[vN]`.
354    fn validate_arxiv_new(s: &str) -> Result<(), ()> {
355        let dot_idx = s.find('.').ok_or(())?;
356        let head = &s[..dot_idx];
357        let tail = &s[dot_idx + 1..];
358
359        // Head: exactly 4 ASCII digits.
360        if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
361            return Err(());
362        }
363
364        // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
365        let bytes = tail.as_bytes();
366        let mut i = 0;
367        while i < bytes.len() && bytes[i].is_ascii_digit() {
368            i += 1;
369        }
370        let digits_len = i;
371        if !(4..=5).contains(&digits_len) {
372            return Err(());
373        }
374        if i == bytes.len() {
375            return Ok(());
376        }
377        // Optional version suffix.
378        if bytes[i] != b'v' {
379            return Err(());
380        }
381        i += 1;
382        let v_start = i;
383        while i < bytes.len() && bytes[i].is_ascii_digit() {
384            i += 1;
385        }
386        if i == v_start || i != bytes.len() {
387            return Err(());
388        }
389        Ok(())
390    }
391
392    /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
393    /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
394    fn validate_arxiv_old(s: &str) -> Result<(), ()> {
395        let slash_idx = s.find('/').ok_or(())?;
396        let class = &s[..slash_idx];
397        let id = &s[slash_idx + 1..];
398
399        // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
400        // ASCII upper).
401        let (core_class, dot_part) = match class.find('.') {
402            Some(d) => (&class[..d], Some(&class[d + 1..])),
403            None => (class, None),
404        };
405        if core_class.is_empty()
406            || !core_class
407                .chars()
408                .all(|c| c.is_ascii_lowercase() || c == '-')
409            || core_class.starts_with('-')
410            || core_class.ends_with('-')
411        {
412            return Err(());
413        }
414        if let Some(dp) = dot_part {
415            if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
416                return Err(());
417            }
418        }
419
420        // Id: 7 digits, optional `vN`.
421        let bytes = id.as_bytes();
422        let mut i = 0;
423        while i < bytes.len() && bytes[i].is_ascii_digit() {
424            i += 1;
425        }
426        if i != 7 {
427            return Err(());
428        }
429        if i == bytes.len() {
430            return Ok(());
431        }
432        if bytes[i] != b'v' {
433            return Err(());
434        }
435        i += 1;
436        let v_start = i;
437        while i < bytes.len() && bytes[i].is_ascii_digit() {
438            i += 1;
439        }
440        if i == v_start || i != bytes.len() {
441            return Err(());
442        }
443        Ok(())
444    }
445}
446
447// ---------------------------------------------------------------------------
448// RefParseError
449// ---------------------------------------------------------------------------
450
/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
///
/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
/// CLI; the granular shape is preserved for tests and for future log
/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
/// `docs/PUBLIC_API.md` §4.
///
/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
/// change. Pattern-match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
    /// Input was empty, either outright or once a leading `doi:` /
    /// `arxiv:` scheme had been stripped.
    #[error("empty input")]
    Empty,
    /// Input did not begin with the required `10.` literal (after any
    /// scheme strip).
    #[error("DOI must begin with '10.'")]
    MissingDoiPrefix,
    /// Input started with `10.` but had no `/` separator between
    /// registrant and suffix.
    #[error("DOI must contain '/' between registrant and suffix")]
    MissingDoiSuffixSeparator,
    /// Registrant (the text between `10.` and the first `/`) was not 4–9
    /// ASCII digits.
    #[error("DOI registrant must be 4–9 ASCII digits")]
    InvalidDoiRegistrant,
    /// DOI suffix (the text after the first `/`) was empty.
    #[error("DOI suffix is empty")]
    EmptyDoiSuffix,
    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes.
    #[error("DOI suffix is {len} bytes; maximum is {max}")]
    DoiSuffixTooLong {
        /// Observed suffix length, in bytes.
        len: usize,
        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
        max: usize,
    },
    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
    #[error("DOI suffix contains invalid character {ch:?}")]
    InvalidDoiSuffixChar {
        /// The first offending character.
        ch: char,
    },
    /// Input matched neither the new-style nor old-style arXiv shape. The
    /// specific check that failed is not reported.
    #[error("input does not match any known arXiv id shape")]
    InvalidArxivShape,
}
500
501impl From<RefParseError> for ErrorCode {
502    fn from(_: RefParseError) -> Self {
503        // All parse failures collapse to INVALID_REF at the public boundary,
504        // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
505        ErrorCode::InvalidRef
506    }
507}
508
509// ---------------------------------------------------------------------------
510// Safekey
511// ---------------------------------------------------------------------------
512
/// A filesystem-safe key derived deterministically from a `Ref`.
///
/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
/// Construct via [`Ref::safekey`]; inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
///
/// NOTE(review): the derived `Deserialize` accepts any string as a
/// `Safekey` without re-deriving or checking it. Confirm that deserialized
/// keys come from trusted data before they are used to build paths.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);
522
523impl Safekey {
524    /// Returns the safekey as a string slice.
525    pub fn as_str(&self) -> &str {
526        &self.0
527    }
528}
529
530impl Ref {
531    /// Returns the bare identifier string usable as a provenance `ref` field.
532    ///
533    /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
534    /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
535    /// inner identifiers (it is stripped at parse time), so the result is
536    /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
537    /// to populate the `ref` column of provenance log rows
538    /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
539    pub fn as_input_str(&self) -> &str {
540        match self {
541            Ref::Doi(d) => d.as_str(),
542            Ref::Arxiv(a) => a.as_str(),
543        }
544    }
545
546    /// Derives a deterministic, filesystem-safe key from this reference.
547    ///
548    /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
549    /// Both Rust and Julia implementations MUST produce bit-identical output
550    /// for every entry in `tests/fixtures/safekey/vectors.json`.
551    ///
552    /// # Algorithm summary
553    ///
554    /// 1. Prefix with `doi_` or `arxiv_` (per variant).
555    /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
556    /// 3. Collapse consecutive `_` runs to a single `_`.
557    /// 4. Trim leading/trailing `_`.
558    /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
559    ///    `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
560    ///    the step-1 output, before escaping).
561    ///
562    /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
563    /// only ASCII bytes), so the byte-slice in step 5 cannot split a
564    /// multibyte char.
565    pub fn safekey(&self) -> Safekey {
566        // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
567        // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
568        // not relevant here).
569        let raw = match self {
570            Ref::Doi(d) => format!("doi_{}", d.as_str()),
571            Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
572        };
573
574        // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
575        // String::chars() as full Unicode code points) all hit the wildcard
576        // arm and become a single '_'.
577        let escaped: String = raw
578            .chars()
579            .map(|c| match c {
580                'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
581                _ => '_',
582            })
583            .collect();
584
585        // Step 2: collapse consecutive '_' runs to a single '_'.
586        let mut collapsed = String::with_capacity(escaped.len());
587        let mut last_was_underscore = false;
588        for c in escaped.chars() {
589            if c == '_' {
590                if !last_was_underscore {
591                    collapsed.push('_');
592                }
593                last_was_underscore = true;
594            } else {
595                collapsed.push(c);
596                last_was_underscore = false;
597            }
598        }
599
600        // Step 3: trim leading/trailing '_'.
601        let trimmed = collapsed.trim_matches('_');
602
603        // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
604        // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
605        let key = if trimmed.len() > 192 {
606            let digest = sha2::Sha256::digest(raw.as_bytes());
607            let hash = hex::encode(&digest[..4]);
608            format!("{}_{}", &trimmed[..192], hash)
609        } else {
610            trimmed.to_string()
611        };
612
613        Safekey(key)
614    }
615}
616
617// ---------------------------------------------------------------------------
618// ErrorCode
619// ---------------------------------------------------------------------------
620
/// The closed set of error codes doiget surfaces.
///
/// See `docs/ERRORS.md` for the persona × code matrix.
///
/// Wire form is the `SCREAMING_SNAKE_CASE` variant name (e.g.
/// `"INVALID_REF"`); [`ErrorCode::as_wire`] returns the same token without
/// going through serde.
///
/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
/// version bump.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
    /// DOI / arXiv id failed validation.
    InvalidRef,
    /// Tier 1 sources reported no OA URL.
    NoOaAvailable,
    /// Internal rate cap or upstream 429.
    RateLimited,
    /// Transport / DNS / TLS failure.
    NetworkError,
    /// Filesystem write failed.
    StoreError,
    /// Provenance log write failed; the fetch was aborted.
    LogError,
    /// Source not granted by the runtime `CapabilityProfile`.
    CapabilityDenied,
    /// Per-request timeout exceeded.
    FetchTimeout,
    /// Store entry's `schema_version` is ahead of this build.
    SchemaTooNew,
    /// Could not acquire `flock` within 5 s.
    LockTimeout,
    /// Bug — please open an issue.
    InternalError,
    /// Feature is spec'd but not yet wired in this Phase. Distinct from
    /// [`Self::InternalError`] (which signals a bug) and
    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
    /// Returned by stubs that exist to pin the public surface ahead of
    /// orchestrator implementation, so an agent can react with "wait for
    /// next minor release" rather than "report a bug" or "tweak my
    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
    NotImplemented,
}
662
663impl ErrorCode {
664    /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
665    /// `&'static str`. Identical to the serde representation but
666    /// allocation-free and usable where a borrowed string with a
667    /// `'static` lifetime is required — notably the provenance log
668    /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
669    /// row records the *actual* mapped code instead of a hand-written
670    /// literal that can drift from this enum (issue #118).
671    #[must_use]
672    pub fn as_wire(&self) -> &'static str {
673        match self {
674            ErrorCode::InvalidRef => "INVALID_REF",
675            ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
676            ErrorCode::RateLimited => "RATE_LIMITED",
677            ErrorCode::NetworkError => "NETWORK_ERROR",
678            ErrorCode::StoreError => "STORE_ERROR",
679            ErrorCode::LogError => "LOG_ERROR",
680            ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
681            ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
682            ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
683            ErrorCode::LockTimeout => "LOCK_TIMEOUT",
684            ErrorCode::InternalError => "INTERNAL_ERROR",
685            ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
686        }
687    }
688}
689
690// ---------------------------------------------------------------------------
691// DenialReason / DenialContext (ADR-0023)
692// ---------------------------------------------------------------------------
693
/// Closed-set reasons a denial-class error envelope can carry on its
/// optional `denial_context.reason` field.
///
/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
/// semver bump; renaming or repurposing one is a breaking change. Mirrors
/// the stability rule that already governs [`ErrorCode`].
///
/// NOTE(review): unlike [`ErrorCode`], this enum is not marked
/// `#[non_exhaustive]`, so downstream crates may match it exhaustively and
/// a new variant would break them at compile time. Confirm that this is
/// consistent with the "minor semver bump" rule above.
///
/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
/// semver-locked surface contract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
    /// Redirect target host did not match the source's allowlist
    /// (`HttpError::RedirectDenied`).
    RedirectNotInAllowlist,
    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
    InsecureScheme,
    /// Source produced a URL whose host is on a future blocklist.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the future
    /// per-source URL host-blocklist guard once that component lands
    /// (post-Phase-1 supply-chain hardening; see
    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
    HostInBlockList,
    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
    SizeCapExceeded,
    /// Store entry's `schema_version` is ahead of this binary.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// `FsStore` schema-rejection path once the read-side bump check
    /// lands (it currently only writes the current `SCHEMA_VERSION`).
    SchemaDrift,
    /// Source not in the runtime [`CapabilityProfile`]
    /// (`FetchError::NotEligible`).
    CapabilityNotGranted,
    /// Rate limiter rejected the call inside the current window.
    ///
    /// Reserved — no producer wired yet. Will be emitted by
    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
    /// limiter surfaces structured denials (Phase 2+; today the
    /// limiter only sleeps to enforce the window).
    RateLimitWindow,
    /// SSRF guard rejected a private / link-local / cloud-metadata address.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// future SSRF pre-flight check (post-Phase-1 supply-chain
    /// hardening; the workspace currently relies on rustls + the
    /// HTTPS-only redirect policy to keep the attack surface small).
    SsrfPrivateAddress,
    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
    ContentTypeMismatch,
}
748
/// Structured, machine-parseable companion to `error.message` for
/// recoverable denials.
///
/// The field is **optional and additive** on the public error envelope —
/// every previously-shipped `{code, message}` envelope remains valid, and
/// agents that ignore this struct continue to work. When present, it
/// carries the concrete parameters an LLM agent can use to plan a recovery
/// (e.g. "the redirect to `evil.example.com` was denied because it is not
/// in the crossref allowlist") without text-mining `error.message`.
///
/// ## Wire shape
///
/// `#[serde(deny_unknown_fields)]`: deserialization rejects any key this
/// struct does not declare, so forward-compatible field additions on the
/// wire are forbidden by design — adding a field to this struct is a
/// **breaking** change. This is why the type is **not** `#[non_exhaustive]`
/// (per `docs/PUBLIC_API.md` §8): both production rules — Rust struct
/// construction outside the crate AND wire-level extension — must agree.
///
/// All fields except `reason` are optional. Producers populate the fields
/// relevant to the reason and leave the rest at `None`; consumers MUST
/// tolerate any subset of fields being present. Optional fields are
/// skipped on serialize when `None` and accepted as missing on deserialize
/// via `#[serde(default, skip_serializing_if = "Option::is_none")]`.
///
/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
/// so the producer can distinguish "this reason has no allowlist channel"
/// (`None` → field absent on the wire) from "this is the explicit list of
/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
/// the wire). The previous `Vec<String>` shape collapsed both states
/// into "field omitted", which an LLM agent could not safely disambiguate.
///
/// Mapping table: see ADR-0023 §4, plus the
/// `From<&HttpError> for Option<DenialContext>` and
/// `From<&FetchError> for Option<DenialContext>` impls in
/// [`crate::http`] / [`crate::source`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
    /// Closed-enum reason code; the only required field (it carries no
    /// `#[serde(default)]`, so it must be present on deserialize).
    pub reason: DenialReason,
    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Concrete value the producer attempted (host, path, hex magic bytes,
    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
    /// as opaque text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempted: Option<String>,
    /// Allowlist entries / acceptable values. `Option<Vec<String>>` so the
    /// producer can distinguish "this reason has no allowlist channel"
    /// (`None`, field absent on the wire) from "this is the explicit list
    /// of acceptable values, possibly empty" (`Some(vec![])`, `"expected":[]`
    /// on the wire). The inner `Vec<String>` is used even when only one
    /// value is meaningful (e.g. `Some(vec!["%PDF-".into()])`) so the
    /// format does not have to flip when multiple values are acceptable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected: Option<Vec<String>>,
    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
    /// larger value indicates a bug.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hop_index: Option<u8>,
    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cap: Option<u64>,
    /// Observed value that is compared against [`Self::cap`] (e.g. the
    /// response byte count when [`Self::cap`] is the byte cap, or a row's
    /// `schema_version` when [`Self::cap`] holds the binary's).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub actual: Option<u64>,
}
819
820// ---------------------------------------------------------------------------
821// ResolvedCandidate / ResolveResult (Issue #242)
822// ---------------------------------------------------------------------------
823
/// A candidate paper resolved from a bibliographic citation string.
///
/// Derives `PartialEq` but not `Eq`, because [`Self::score`] is an `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ResolvedCandidate {
    /// Resolved DOI.
    pub doi: String,
    /// Title of the resolved candidate.
    pub title: String,
    /// First author or primary author representation.
    pub author: String,
    /// Publication year, if resolved; `None` when it is not known.
    pub year: Option<i32>,
    /// Token similarity overlap score in `0.0..=1.0`. The range is a
    /// producer-side convention; this type does not validate it.
    pub score: f64,
    /// Resolving metadata source (e.g. `"crossref"`).
    pub source: String,
}
840
/// The result structure returned by bibliographic citation resolution.
///
/// Pairs the caller's query with the candidates found for it. This type is
/// a plain data carrier: the ordering and score threshold described on
/// [`Self::candidates`] are established by whoever builds the value, not
/// enforced here.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ResolveResult {
    /// The original query bibliographic citation string.
    pub query: String,
    /// Ranked candidate list (highest score first, thresholded to >= 0.5).
    /// May be empty.
    pub candidates: Vec<ResolvedCandidate>,
}
849
850// ---------------------------------------------------------------------------
851// CapabilityProfile (placeholder; full impl in Phase 1)
852// ---------------------------------------------------------------------------
853
/// Marker for the always-on Open Access tier. See `docs/CAPABILITY.md`.
///
/// A unit struct: it carries no data and exists only so that
/// [`CapabilityProfile::oa`] has a type expressing "Tier 1 is not gated".
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
857
/// Which Tier 2 metadata sources are enabled this session. See `docs/CAPABILITY.md`.
///
/// The derived `Default` leaves every source disabled (all flags `false`).
/// [`CapabilityProfile::from_env`] sets a flag to `true` only when the
/// matching env var is present **and** the `metadata` Cargo feature is
/// compiled in.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
    /// Phase 4+; enabled by `DOIGET_ENABLE_OPENALEX`.
    pub openalex: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_S2`.
    pub semantic_scholar: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_DOAJ`.
    pub doaj: bool,
}
869
/// Process-wide rate limits. Hard-coded; not configurable.
///
/// Construct only via [`RateLimits::HARD_CODED`]. The struct fields are
/// `pub(crate)` so downstream code cannot synthesize a `RateLimits` with
/// different values, which would weaken `docs/LEGAL.md` §6 safeguard 8.
/// Read the values through the accessor methods of the same names.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct RateLimits {
    /// Maximum number of fetches in flight at once; `HARD_CODED` takes it
    /// from [`MAX_CONCURRENT_FETCHES`].
    pub(crate) max_concurrent_fetches: u32,
    /// Maximum fetch attempts per second; `HARD_CODED` takes it from
    /// [`MAX_FETCHES_PER_SECOND`].
    pub(crate) max_fetches_per_second: f32,
    /// Per-source backoff between consecutive requests, in milliseconds.
    pub(crate) per_source_backoff_ms: u64,
}
882
883impl RateLimits {
884    /// The single, hard-coded set of rate limits. There is no other public
885    /// constructor — see the type-level docs.
886    pub const HARD_CODED: Self = Self {
887        max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
888        max_fetches_per_second: MAX_FETCHES_PER_SECOND,
889        per_source_backoff_ms: 200,
890    };
891
892    /// Maximum number of concurrent fetches in flight.
893    pub const fn max_concurrent_fetches(&self) -> u32 {
894        self.max_concurrent_fetches
895    }
896
897    /// Maximum fetch attempts per second across all sources.
898    pub const fn max_fetches_per_second(&self) -> f32 {
899        self.max_fetches_per_second
900    }
901
902    /// Per-source backoff in milliseconds between consecutive requests.
903    pub const fn per_source_backoff_ms(&self) -> u64 {
904        self.per_source_backoff_ms
905    }
906}
907
/// A successful TDM grant.
///
/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
/// flows from the startup capability gate into the source, rather than each
/// TDM source re-reading the env var at fetch time (issue #153 — an env
/// mutation between startup and fetch is otherwise undetectable).
///
/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
/// is compiled in (the `secrecy` dependency is `optional = true` and gated
/// on those features per ADR-0002, so default release binaries contain no
/// TDM code path at all). The struct is `#[non_exhaustive]`; the
/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
/// builds that toggle the feature set.
///
/// `docs/CAPABILITY.md` §1 originally spelled the key type as
/// `Secret<String>`, which is the `secrecy` 0.9 name. The workspace pins
/// `secrecy` 0.10, whose equivalent owned-string secret type is
/// `secrecy::SecretString` (`= SecretBox<str>`), and CAPABILITY.md §1 has
/// been updated to match the 0.10 API. `Debug` redacts the value.
///
/// Implements `Default` so in-crate test fixtures using
/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
/// the default `api_key` is an empty secret, the default `agree_env_var`
/// is empty, and the default `agreed_at` is the time `default()` was called.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TdmGrant {
    /// The publisher API key, validated present at startup by
    /// [`CapabilityProfile::from_env`]. Wrapped in
    /// `secrecy::SecretString` so `Debug` never prints it; use
    /// `secrecy::ExposeSecret::expose_secret` at the point of use.
    ///
    /// Only present when a `tdm-*` feature is compiled in (see the
    /// type-level docs and ADR-0002).
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    pub api_key: secrecy::SecretString,
    /// Which env var the user used to acknowledge the publisher's ToS
    /// (e.g. `DOIGET_AGREE_TDM_ELSEVIER`).
    pub agree_env_var: String,
    /// When the agreement env var was first observed at startup, in UTC.
    pub agreed_at: chrono::DateTime<chrono::Utc>,
}
952
953impl Default for TdmGrant {
954    fn default() -> Self {
955        Self {
956            #[cfg(any(
957                feature = "tdm-elsevier",
958                feature = "tdm-aps",
959                feature = "tdm-springer"
960            ))]
961            api_key: secrecy::SecretString::from(String::new()),
962            agree_env_var: String::new(),
963            agreed_at: chrono::Utc::now(),
964        }
965    }
966}
967
/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
///
/// Marked `#[non_exhaustive]` so adding new capability classes (fields) is
/// non-breaking. Code outside this crate that destructures the struct must
/// name only the documented fields and end the pattern with `..`.
///
/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
/// Struct-literal construction is blocked outside this crate by
/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
/// rely on the resolution rules in `from_env`. `Default` is **not yet**
/// implemented; Phase 1 will add it once the field set stabilizes.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CapabilityProfile {
    /// Tier 1 OA sources are always permitted.
    pub oa: AlwaysOn,
    /// Tier 2 metadata access (Phase 4+).
    pub metadata: MetadataAccess,
    /// Tier 3 Elsevier grant. `from_env` populates it only when
    /// `DOIGET_AGREE_TDM_ELSEVIER` is exactly `"1"`, `DOIGET_KEY_ELSEVIER`
    /// is non-empty, and the `tdm-elsevier` feature is compiled in.
    pub tdm_elsevier: Option<TdmGrant>,
    /// Tier 3 APS grant. `from_env` populates it only when
    /// `DOIGET_AGREE_TDM_APS` is exactly `"1"`, `DOIGET_KEY_APS` is
    /// non-empty, and the `tdm-aps` feature is compiled in.
    pub tdm_aps: Option<TdmGrant>,
    /// Tier 3 Springer grant. `from_env` populates it only when
    /// `DOIGET_AGREE_TDM_SPRINGER` is exactly `"1"`, `DOIGET_KEY_SPRINGER`
    /// is non-empty, and the `tdm-springer` feature is compiled in.
    pub tdm_springer: Option<TdmGrant>,
    /// Hard-coded rate limits for this process; `from_env` always sets
    /// this to [`RateLimits::HARD_CODED`].
    pub rate_limits: RateLimits,
}
994
/// Errors that can arise during [`CapabilityProfile::from_env`].
///
/// Both variants describe a half-configured Tier 3 (TDM) env-var pair for
/// one publisher. A pair with neither side configured is not an error.
#[derive(Debug, thiserror::Error)]
pub enum CapabilityError {
    /// User set the agree env var to `"1"` but provided no key. An empty
    /// key value counts as "no key". See `docs/CAPABILITY.md` §2.
    #[error("env {agree_var} is set but {key_var} is missing")]
    AgreedButNoKey {
        /// The agreement env var the user set.
        agree_var: String,
        /// The key env var that should accompany it.
        key_var: String,
    },
    /// Key env var is set (non-empty) but the agreement env var is unset
    /// or holds anything other than `"1"`. See `docs/CAPABILITY.md` §2.
    ///
    /// The message names only the agreement var; the key var's name is not
    /// carried by this variant.
    #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
    KeyButNotAgreed {
        /// The agreement env var the user must set to `1` before the key takes effect.
        agree_var: String,
    },
}
1013
1014impl CapabilityProfile {
1015    /// Read the runtime profile from environment variables.
1016    ///
1017    /// Implements the resolution algorithm specified in
1018    /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
1019    ///
1020    /// # Tier 1 (Open Access)
1021    ///
1022    /// Always permitted; not gated on any env var or feature.
1023    ///
1024    /// # Tier 2 (metadata)
1025    ///
1026    /// Each metadata source becomes available when its env var is set
1027    /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
1028    /// was compiled in. If the env var is set but the feature is not compiled
1029    /// in, a `tracing::warn!` is emitted and the source is left disabled —
1030    /// this is not an error so that users can move binaries between machines
1031    /// (or switch feature sets between cargo invocations) without breaking
1032    /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1033    ///
1034    /// # Tier 3 (TDM)
1035    ///
1036    /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1037    /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1038    /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1039    ///
1040    /// - both unset → `tdm_<x> = None` (no error);
1041    /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1042    ///   feature gate below);
1043    /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1044    /// - key set but `agree` unset (or `agree != "1"`) →
1045    ///   [`CapabilityError::KeyButNotAgreed`].
1046    ///
1047    /// When both env vars are set correctly **but** the corresponding
1048    /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1049    /// `tracing::warn!` and sets the grant to `None` rather than returning an
1050    /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1051    ///
1052    /// # Precondition: tracing subscriber must be installed first
1053    ///
1054    /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1055    /// install a `tracing-subscriber` (or equivalent) **before** invoking
1056    /// this function, otherwise warnings are silently dropped. The
1057    /// `doiget-cli` binary does this in `main.rs`.
1058    ///
1059    /// # Errors
1060    ///
1061    /// Returns [`CapabilityError::AgreedButNoKey`] or
1062    /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1063    /// publisher is misconfigured. See the variant docs for the precise
1064    /// trigger conditions.
1065    ///
1066    /// # Note on `api_key` storage
1067    ///
1068    /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1069    /// validated key as `secrecy::SecretString` (issue #153). The key is
1070    /// read exactly once here, at startup; TDM sources consume it from the
1071    /// grant and never re-read the env var at fetch time. This makes the
1072    /// grant a true startup attestation — an env mutation between startup
1073    /// and fetch can no longer silently change the credential in flight.
1074    /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1075    pub fn from_env() -> Result<Self, CapabilityError> {
1076        // Issue #153: the validated API key is now threaded through
1077        // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1078        // features) by `resolve_tdm_grant` below — sources no longer
1079        // re-read the key env var at fetch time. See the `TdmGrant`
1080        // doc-comment and `docs/CAPABILITY.md` §1/§2.
1081
1082        // -- Tier 2 metadata -------------------------------------------------
1083        let metadata = MetadataAccess {
1084            openalex: resolve_metadata_flag(
1085                "DOIGET_ENABLE_OPENALEX",
1086                "metadata",
1087                cfg!(feature = "metadata"),
1088            ),
1089            semantic_scholar: resolve_metadata_flag(
1090                "DOIGET_ENABLE_S2",
1091                "metadata",
1092                cfg!(feature = "metadata"),
1093            ),
1094            doaj: resolve_metadata_flag(
1095                "DOIGET_ENABLE_DOAJ",
1096                "metadata",
1097                cfg!(feature = "metadata"),
1098            ),
1099        };
1100
1101        // -- Tier 3 TDM grants ----------------------------------------------
1102        let tdm_elsevier = resolve_tdm_grant(
1103            "DOIGET_AGREE_TDM_ELSEVIER",
1104            "DOIGET_KEY_ELSEVIER",
1105            "tdm-elsevier",
1106            cfg!(feature = "tdm-elsevier"),
1107        )?;
1108        let tdm_aps = resolve_tdm_grant(
1109            "DOIGET_AGREE_TDM_APS",
1110            "DOIGET_KEY_APS",
1111            "tdm-aps",
1112            cfg!(feature = "tdm-aps"),
1113        )?;
1114        let tdm_springer = resolve_tdm_grant(
1115            "DOIGET_AGREE_TDM_SPRINGER",
1116            "DOIGET_KEY_SPRINGER",
1117            "tdm-springer",
1118            cfg!(feature = "tdm-springer"),
1119        )?;
1120
1121        Ok(Self {
1122            oa: AlwaysOn,
1123            metadata,
1124            tdm_elsevier,
1125            tdm_aps,
1126            tdm_springer,
1127            rate_limits: RateLimits::HARD_CODED,
1128        })
1129    }
1130}
1131
1132/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1133///
1134/// Returns `true` only when both the env var is present and the feature is
1135/// compiled in. When the env var is set without the feature, emits a
1136/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1137/// for the rationale (binaries may move between hosts / feature sets).
1138fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1139    let env_set = std::env::var_os(env_var).is_some();
1140    match (env_set, feature_enabled) {
1141        (true, true) => true,
1142        (true, false) => {
1143            tracing::warn!(
1144                env_var,
1145                feature,
1146                "{} is set but feature {} was not compiled in; the source will be unavailable",
1147                env_var,
1148                feature
1149            );
1150            false
1151        }
1152        (false, _) => false,
1153    }
1154}
1155
1156/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1157/// per-publisher Cargo feature.
1158///
1159/// Implements the rules in `docs/CAPABILITY.md` §2:
1160///
1161/// - both unset → `Ok(None)`.
1162/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1163///   feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1164///   compiled in).
1165/// - `agree == "1"` and `key` unset →
1166///   [`CapabilityError::AgreedButNoKey`].
1167/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1168///   → [`CapabilityError::KeyButNotAgreed`].
1169fn resolve_tdm_grant(
1170    agree_var: &str,
1171    key_var: &str,
1172    feature: &str,
1173    feature_enabled: bool,
1174) -> Result<Option<TdmGrant>, CapabilityError> {
1175    // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1176    // value (including "true", "yes", empty) is treated as not-agreed per
1177    // `docs/CAPABILITY.md` §2.
1178    let agree_raw = std::env::var(agree_var).ok();
1179    let agreed = matches!(agree_raw.as_deref(), Some("1"));
1180    let agree_present = agree_raw.is_some();
1181    // Read the key value once, at startup, so the validated key flows
1182    // through `TdmGrant` and sources never re-read the env (issue #153).
1183    // An empty value is treated as "not set" — an empty API key cannot
1184    // authenticate, and silently constructing a grant around it would
1185    // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1186    let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1187
1188    match (agreed, agree_present, key_value) {
1189        (true, _, Some(key)) => {
1190            if feature_enabled {
1191                Ok(Some(build_tdm_grant(agree_var, key)))
1192            } else {
1193                // `key` is dropped here; under no-tdm builds it is the only
1194                // consumer of the owned `String`, which is intended.
1195                let _ = key;
1196                tracing::warn!(
1197                    env_var = agree_var,
1198                    feature,
1199                    "{} is set but feature {} was not compiled in; the source will be unavailable",
1200                    agree_var,
1201                    feature
1202                );
1203                Ok(None)
1204            }
1205        }
1206        (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1207            agree_var: agree_var.to_string(),
1208            key_var: key_var.to_string(),
1209        }),
1210        // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1211        // otherwise authorize the source without an explicit agreement).
1212        (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1213            agree_var: agree_var.to_string(),
1214        }),
1215        // agree unset, key set: KeyButNotAgreed (same rule).
1216        (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1217            agree_var: agree_var.to_string(),
1218        }),
1219        // agree set to non-"1" and no key: treat as no-grant. The user
1220        // expressed something but did not opt in and provided no credential,
1221        // so silent skip is the safe default (no source enabled).
1222        (false, true, None) => Ok(None),
1223        // Neither env var set: no grant, no error.
1224        (false, false, None) => Ok(None),
1225    }
1226}
1227
1228/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1229///
1230/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1231/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1232/// (dropped) here — the grant is still produced so that startup attestation
1233/// behavior (the warn-and-skip path) does not change shape between feature
1234/// sets.
1235fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1236    #[cfg(any(
1237        feature = "tdm-elsevier",
1238        feature = "tdm-aps",
1239        feature = "tdm-springer"
1240    ))]
1241    {
1242        TdmGrant {
1243            api_key: secrecy::SecretString::from(key),
1244            agree_env_var: agree_var.to_string(),
1245            agreed_at: chrono::Utc::now(),
1246        }
1247    }
1248    #[cfg(not(any(
1249        feature = "tdm-elsevier",
1250        feature = "tdm-aps",
1251        feature = "tdm-springer"
1252    )))]
1253    {
1254        let _ = key;
1255        TdmGrant {
1256            agree_env_var: agree_var.to_string(),
1257            agreed_at: chrono::Utc::now(),
1258        }
1259    }
1260}
1261
1262// ---------------------------------------------------------------------------
1263// Tests — one smoke test per legally-load-bearing constant. See
1264// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
1265// `cargo test --workspace` job from being a false-green during Phase 0.
1266// ---------------------------------------------------------------------------
1267
1268// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1269// The workspace lints deny them in production code; relax for the test module
1270// only.
1271#[cfg(test)]
1272#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1273mod tests {
1274    use super::*;
1275
1276    #[test]
1277    fn rate_limits_hard_coded_match_legal_safeguards() {
1278        // docs/LEGAL.md §6 safeguard 8 names these exact values.
1279        assert_eq!(RateLimits::HARD_CODED.max_concurrent_fetches(), 5);
1280        assert!((RateLimits::HARD_CODED.max_fetches_per_second() - 5.0).abs() < f32::EPSILON);
1281        assert_eq!(RateLimits::HARD_CODED.per_source_backoff_ms(), 200);
1282    }
1283
1284    #[test]
1285    fn batch_size_caps_match_security_doc() {
1286        // docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
1287        assert_eq!(MCP_BATCH_MAX_SIZE, 100);
1288        assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
1289        assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
1290        assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);
1291        // Slice 2: spec-language alias for MCP_BATCH_MAX_SIZE must
1292        // numerically agree with the original constant.
1293        assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
1294    }
1295
1296    #[test]
1297    fn schema_version_is_pinned_to_1_0() {
1298        // docs/STORE.md §3 — Phase 0/1 writes 1.0 exactly.
1299        // A bump to 1.1 (minor, backward-compat additions) requires updating
1300        // both this test and the cross-tool compat fixtures simultaneously.
1301        assert_eq!(SCHEMA_VERSION, "1.0");
1302    }
1303
1304    // -----------------------------------------------------------------
1305    // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1306    //
1307    // These tests mutate process-global env state via std::env::set_var /
1308    // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1309    // captures the pre-test value of every env var it touches and restores
1310    // it on drop (even on panic). They also use `#[serial_test::serial]` so
1311    // that no two tests in this module touch env state concurrently — the
1312    // workspace's test runner defaults to multi-threaded.
1313    //
1314    // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1315    // reference table).
1316    // -----------------------------------------------------------------
1317
1318    /// RAII guard that captures the prior value of an env var on construction
1319    /// and restores it on drop. Use one guard per touched var per test.
1320    struct EnvGuard {
1321        var: &'static str,
1322        prior: Option<std::ffi::OsString>,
1323    }
1324
1325    impl EnvGuard {
1326        /// Capture and clear `var`. Use `set` afterwards to install a value.
1327        fn unset(var: &'static str) -> Self {
1328            let prior = std::env::var_os(var);
1329            // SAFETY (env mutation): tests are serialized via
1330            // `#[serial_test::serial]`. `remove_var` is sound when no other
1331            // thread reads or writes the environment concurrently.
1332            std::env::remove_var(var);
1333            EnvGuard { var, prior }
1334        }
1335
1336        /// Capture, then set `var` to `value`.
1337        fn set(var: &'static str, value: &str) -> Self {
1338            let prior = std::env::var_os(var);
1339            std::env::set_var(var, value);
1340            EnvGuard { var, prior }
1341        }
1342    }
1343
1344    impl Drop for EnvGuard {
1345        fn drop(&mut self) {
1346            match &self.prior {
1347                Some(v) => std::env::set_var(self.var, v),
1348                None => std::env::remove_var(self.var),
1349            }
1350        }
1351    }
1352
1353    /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1354    /// algorithm reads, returning a vector of guards that restore them on
1355    /// drop. Callers can then `EnvGuard::set` individual vars on top.
1356    fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1357        [
1358            "DOIGET_ENABLE_OPENALEX",
1359            "DOIGET_ENABLE_S2",
1360            "DOIGET_ENABLE_DOAJ",
1361            "DOIGET_AGREE_TDM_ELSEVIER",
1362            "DOIGET_KEY_ELSEVIER",
1363            "DOIGET_AGREE_TDM_APS",
1364            "DOIGET_KEY_APS",
1365            "DOIGET_AGREE_TDM_SPRINGER",
1366            "DOIGET_KEY_SPRINGER",
1367        ]
1368        .iter()
1369        .map(|v| EnvGuard::unset(v))
1370        .collect()
1371    }
1372
1373    #[test]
1374    #[serial_test::serial]
1375    fn from_env_no_env_vars_set_returns_tier_1_only() {
1376        // Rule: with every relevant env var unset, the resolved profile has
1377        // all TDM grants `None` and all metadata flags `false`. Hard-coded
1378        // rate limits still apply. (Replaces the old Phase 0 stub test.)
1379        let _g = unset_all_capability_env_vars();
1380
1381        let p = CapabilityProfile::from_env().expect("clean env never errors");
1382        assert!(p.tdm_elsevier.is_none());
1383        assert!(p.tdm_aps.is_none());
1384        assert!(p.tdm_springer.is_none());
1385        assert!(!p.metadata.openalex);
1386        assert!(!p.metadata.semantic_scholar);
1387        assert!(!p.metadata.doaj);
1388        assert_eq!(p.rate_limits.max_concurrent_fetches(), 5);
1389    }
1390
1391    #[test]
1392    #[serial_test::serial]
1393    fn from_env_no_tdm_returns_tier_1_profile() {
1394        // Rule (CAPABILITY.md §2): with every TDM env var unset, all
1395        // `tdm_*` fields are `None` and no error is produced.
1396        let _g = unset_all_capability_env_vars();
1397
1398        let p = CapabilityProfile::from_env().expect("no TDM env -> Ok");
1399        assert!(p.tdm_elsevier.is_none());
1400        assert!(p.tdm_aps.is_none());
1401        assert!(p.tdm_springer.is_none());
1402    }
1403
1404    #[test]
1405    #[serial_test::serial]
1406    fn from_env_agreed_but_no_key_errs() {
1407        // Rule (CAPABILITY.md §2): agree=1 + key unset -> AgreedButNoKey.
1408        let _g = unset_all_capability_env_vars();
1409        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1410
1411        let result = CapabilityProfile::from_env();
1412        match result {
1413            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1414                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1415                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1416            }
1417            other => panic!("expected AgreedButNoKey, got {:?}", other),
1418        }
1419    }
1420
1421    #[test]
1422    #[serial_test::serial]
1423    fn from_env_agreed_but_empty_key_errs() {
1424        // Security-adjacent (PR #161 review): an *empty* key string is
1425        // treated as "not set" by `resolve_tdm_grant`. With agree=1 and
1426        // DOIGET_KEY_ELSEVIER="" the misconfiguration must surface as
1427        // AgreedButNoKey, not silently build a grant around an empty
1428        // secret that could never authenticate.
1429        let _g = unset_all_capability_env_vars();
1430        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1431        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1432
1433        let result = CapabilityProfile::from_env();
1434        match result {
1435            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1436                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1437                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1438            }
1439            other => panic!("expected AgreedButNoKey for empty key, got {:?}", other),
1440        }
1441    }
1442
1443    #[test]
1444    #[serial_test::serial]
1445    fn from_env_empty_key_without_agree_is_no_grant() {
1446        // Security-adjacent (PR #161 review): an empty key with the
1447        // agree var unset is indistinguishable from "no key at all".
1448        // It must resolve to Ok(None) (no grant, no error) — an empty
1449        // string must NOT trip the KeyButNotAgreed leaked-credential
1450        // rule, since there is no credential.
1451        let _g = unset_all_capability_env_vars();
1452        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1453
1454        let p = CapabilityProfile::from_env()
1455            .expect("empty key + agree unset must be Ok(None), not an error");
1456        assert!(
1457            p.tdm_elsevier.is_none(),
1458            "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
1459        );
1460        assert!(p.tdm_aps.is_none());
1461        assert!(p.tdm_springer.is_none());
1462    }
1463
1464    #[test]
1465    #[serial_test::serial]
1466    fn from_env_key_but_not_agreed_errs() {
1467        // Rule (CAPABILITY.md §2): key set + agree unset -> KeyButNotAgreed.
1468        // A leaked DOIGET_KEY_ELSEVIER must not silently enable a source.
1469        let _g = unset_all_capability_env_vars();
1470        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1471
1472        let result = CapabilityProfile::from_env();
1473        match result {
1474            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1475                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1476            }
1477            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1478        }
1479    }
1480
1481    #[test]
1482    #[serial_test::serial]
1483    fn from_env_agree_not_one_errs() {
1484        // Rule (CAPABILITY.md §2): the agree var must be exactly "1". Any
1485        // other value (here: "true") is treated as not-agreed; combined
1486        // with a key set, that triggers KeyButNotAgreed.
1487        let _g = unset_all_capability_env_vars();
1488        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
1489        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1490
1491        let result = CapabilityProfile::from_env();
1492        match result {
1493            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1494                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1495            }
1496            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1497        }
1498    }
1499
1500    #[test]
1501    #[serial_test::serial]
1502    fn from_env_both_set_correctly_returns_grant() {
1503        // Rule (CAPABILITY.md §2): agree=1 + key set -> Some(TdmGrant) when
1504        // the corresponding feature is compiled in; else None (warn-and-skip).
1505        // The feature gate for elsevier is `tdm-elsevier`; this test asserts
1506        // both branches via `cfg!`.
1507        let _g = unset_all_capability_env_vars();
1508        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1509        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1510
1511        let p = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");
1512
1513        if cfg!(feature = "tdm-elsevier") {
1514            let grant = p
1515                .tdm_elsevier
1516                .as_ref()
1517                .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
1518            assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");
1519            // Issue #153 / PR #161 review: prove the key was actually
1520            // threaded into TdmGrant::api_key at startup (not just that
1521            // the agree var was recorded). The field is cfg-gated to
1522            // the same `tdm-*` set as the assertion below, so gate the
1523            // check identically.
1524            #[cfg(any(
1525                feature = "tdm-elsevier",
1526                feature = "tdm-aps",
1527                feature = "tdm-springer"
1528            ))]
1529            {
1530                use secrecy::ExposeSecret as _;
1531                assert_eq!(
1532                    grant.api_key.expose_secret(),
1533                    "sk-test",
1534                    "the DOIGET_KEY_ELSEVIER value must be threaded into \
1535                     TdmGrant::api_key (issue #153)"
1536                );
1537            }
1538        } else {
1539            assert!(
1540                p.tdm_elsevier.is_none(),
1541                "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
1542            );
1543        }
1544    }
1545
1546    #[test]
1547    #[serial_test::serial]
1548    fn from_env_metadata_env_warns_without_feature() {
1549        // Rule (CAPABILITY.md §2): metadata env var without the `metadata`
1550        // feature -> source disabled (warn-and-skip, not an error).
1551        // We don't capture the tracing warn here; we just assert the field
1552        // is `false` when the feature is absent and `true` when present.
1553        let _g = unset_all_capability_env_vars();
1554        let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");
1555
1556        let p = CapabilityProfile::from_env().expect("metadata env never errors");
1557
1558        if cfg!(feature = "metadata") {
1559            assert!(p.metadata.openalex);
1560        } else {
1561            assert!(!p.metadata.openalex);
1562        }
1563    }
1564
1565    // -----------------------------------------------------------------
1566    // Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
1567    //
    // The vectors.json file is the binding cross-tool contract with
    // BiblioFetch.jl: every entry MUST round-trip identically through
    // both implementations. The fixture now carries the full 100-entry
    // set, and `safekey_matches_reference_vectors` below pins that count
    // exactly. (Earlier Phase 0 builds shipped 13 entries while the full
    // set was gated on the BiblioFetch.jl pre-flight; see ADR-0007.)
    //
    // This test deliberately bypasses `Ref::parse`: it branches on the
    // input prefix (`doi:` / `arxiv:`) and constructs the variant
    // directly via the in-crate `pub(crate)` tuple constructor, so it
    // exercises the derivation algorithm rather than parser semantics.
1578    // -----------------------------------------------------------------
1579
    /// One entry of the `vectors` array in the safekey fixture file.
    #[derive(Deserialize)]
    struct SafekeyVector {
        /// User-facing ref carrying a `doi:` / `arxiv:` scheme prefix.
        input: String,
        /// Safekey string the derivation must produce for `input`.
        expected: String,
    }
1585
    /// Top-level shape of the safekey fixture file: a single `vectors` list.
    #[derive(Deserialize)]
    struct SafekeyVectorFile {
        /// Every reference vector, checked one by one by the test below.
        vectors: Vec<SafekeyVector>,
    }
1590
1591    /// In-crate test helper: build a `Ref` from the user-facing form used
1592    /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1593    /// and wrapping the remainder. This bypasses validation; it is fine
1594    /// here because the vectors are hand-curated and the test asserts the
1595    /// derivation algorithm, not parser semantics.
1596    fn ref_from_vector_input(input: &str) -> Ref {
1597        if let Some(rest) = input.strip_prefix("doi:") {
1598            Ref::Doi(Doi(rest.to_string()))
1599        } else if let Some(rest) = input.strip_prefix("arxiv:") {
1600            Ref::Arxiv(ArxivId(rest.to_string()))
1601        } else {
1602            panic!(
1603                "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1604                input
1605            );
1606        }
1607    }
1608
1609    #[test]
1610    fn safekey_matches_reference_vectors() {
1611        // include_str! resolves relative to the file containing this macro
1612        // call (crates/doiget-core/src/lib.rs), so we go up three levels
1613        // to reach the workspace root, then down to tests/fixtures.
1614        let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
1615        let parsed: SafekeyVectorFile =
1616            serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");
1617
1618        // Phase 0 final ships the full NORMATIVE 100-entry set
1619        // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool
1620        // contract with BiblioFetch.jl; tightening the count guard to
1621        // `== 100` ensures the set cannot silently grow or shrink without
1622        // a coordinated ADR bump (per docs/SAFEKEY.md status block).
1623        assert_eq!(
1624            parsed.vectors.len(),
1625            100,
1626            "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
1627            parsed.vectors.len()
1628        );
1629
1630        let mut failures: Vec<String> = Vec::new();
1631        for v in &parsed.vectors {
1632            let r = ref_from_vector_input(&v.input);
1633            let got = r.safekey().as_str().to_string();
1634            if got != v.expected {
1635                failures.push(format!(
1636                    "input={:?}\n  expected={:?}\n  got     ={:?}",
1637                    v.input, v.expected, got
1638                ));
1639            }
1640        }
1641
1642        assert!(
1643            failures.is_empty(),
1644            "{}/{} safekey reference vectors failed:\n{}",
1645            failures.len(),
1646            parsed.vectors.len(),
1647            failures.join("\n")
1648        );
1649    }
1650
1651    #[test]
1652    fn safekey_truncates_long_inputs_with_sha256_suffix() {
1653        // Construct a synthetic DOI whose suffix produces a `trimmed` longer than
1654        // 192 chars after step 3. 220 ASCII-safe chars + the `doi_10.1234/`
1655        // prefix easily exceeds 192. The resulting key must be exactly 201 chars:
1656        // 192 (trimmed prefix) + 1 (`_` separator) + 8 (hex of first 4 bytes of
1657        // SHA-256(raw)). Per docs/SAFEKEY.md §3 step 5.
1658        let suffix = "a".repeat(220);
1659        let doi = Doi(format!("10.1234/{}", suffix));
1660        let key = Ref::Doi(doi).safekey();
1661        let s = key.as_str();
1662
1663        // Shape: <192 ASCII chars from {A-Za-z0-9._-}> + "_" + <8 hex chars>
1664        assert_eq!(
1665            s.len(),
1666            201,
1667            "expected 201-char truncated key, got {}: {}",
1668            s.len(),
1669            s
1670        );
1671        assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");
1672        let hash_part = &s[193..];
1673        assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
1674        assert!(
1675            hash_part
1676                .chars()
1677                .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()),
1678            "hash suffix must be lowercase hex: {}",
1679            hash_part
1680        );
1681
1682        // Determinism: same input twice must produce the same key.
1683        let key2 = Ref::Doi(Doi(format!("10.1234/{}", "a".repeat(220)))).safekey();
1684        assert_eq!(s, key2.as_str(), "safekey must be deterministic");
1685
1686        // Hash content: must equal hex(sha256(raw)[..4]) where raw is the
1687        // pre-escape prefixed form per docs/SAFEKEY.md §3 step 5.
1688        use sha2::Digest;
1689        let raw = format!("doi_10.1234/{}", "a".repeat(220));
1690        let expected_hash = {
1691            let digest = sha2::Sha256::digest(raw.as_bytes());
1692            format!(
1693                "{:02x}{:02x}{:02x}{:02x}",
1694                digest[0], digest[1], digest[2], digest[3]
1695            )
1696        };
1697        assert_eq!(
1698            hash_part, expected_hash,
1699            "hash must match SHA-256 of raw form"
1700        );
1701    }
1702
1703    // -----------------------------------------------------------------
1704    // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1705    // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1706    // category set is the binding contract; each test case below names
1707    // which rule it exercises in a comment.
1708    // -----------------------------------------------------------------
1709
1710    // ---- Doi::parse happy paths (≥6) --------------------------------
1711
1712    #[test]
1713    fn doi_parse_accepts_bare_canonical_form() {
1714        // Rule: "10.<registrant>/<suffix>" is the canonical bare form.
1715        let d = Doi::parse("10.1234/example").expect("canonical bare DOI");
1716        assert_eq!(d.as_str(), "10.1234/example");
1717    }
1718
1719    #[test]
1720    fn doi_parse_accepts_doi_uri_scheme() {
1721        // Rule: the `doi:` scheme is stripped at construction; as_str
1722        // never carries it (matches docs/SAFEKEY.md §3 step 0).
1723        let d = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
1724        assert_eq!(d.as_str(), "10.1234/example");
1725    }
1726
1727    #[test]
1728    fn doi_parse_accepts_complex_real_world_suffix() {
1729        // Rule: suffix charset includes `.`, `(`, `)`, `-`. From a real
1730        // PhysRevLett DOI used elsewhere in the test fixture set.
1731        let d = Doi::parse("10.1103/PhysRevLett.130.200601").expect("real-world PhysRev DOI");
1732        assert_eq!(d.as_str(), "10.1103/PhysRevLett.130.200601");
1733    }
1734
1735    #[test]
1736    fn doi_parse_accepts_parens_in_suffix() {
1737        // Rule: `(` and `)` are explicitly listed in the spec charset.
1738        let d = Doi::parse("10.1016/S0370-1573(98)00122-3").expect("parens in suffix");
1739        assert_eq!(d.as_str(), "10.1016/S0370-1573(98)00122-3");
1740    }
1741
1742    #[test]
1743    fn doi_parse_accepts_nested_slashes_in_suffix() {
1744        // Rule: `/` is a suffix character; only the first `/` is the
1745        // registrant/suffix separator.
1746        let d = Doi::parse("10.1234/foo/bar/baz").expect("nested slashes");
1747        assert_eq!(d.as_str(), "10.1234/foo/bar/baz");
1748    }
1749
1750    #[test]
1751    fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
1752        // #194: legacy Kluwer/Springer DOIs (`10.1023/A:NNNNNNNNNN`)
1753        // carry a `:` in the suffix. Real DOI: "Entanglement, Quantum
1754        // Phase Transitions, and DMRG" (Kluwer, 2002).
1755        let d = Doi::parse("10.1023/A:1019601218492").expect("legacy Kluwer colon DOI");
1756        assert_eq!(d.as_str(), "10.1023/A:1019601218492");
1757    }
1758
1759    #[test]
1760    fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
1761        // #194: EDP Sciences / Journal de Physique legacy corpus uses
1762        // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Real DOIs from the dogfood
1763        // Ising-RG run; both resolve at doi.org and via Crossref.
1764        let d = Doi::parse("10.1051/jphys:0198900500120136500").expect("EDP jphys colon DOI");
1765        assert_eq!(d.as_str(), "10.1051/jphys:0198900500120136500");
1766        let d2 = Doi::parse("doi:10.1051/jphys:0198500460100164500").expect("scheme + colon");
1767        assert_eq!(d2.as_str(), "10.1051/jphys:0198500460100164500");
1768    }
1769
1770    #[test]
1771    fn doi_parse_rejects_semicolon_in_suffix() {
1772        // #194 / ADR-0026: `;` is the natural ASCII neighbor of `:` and
1773        // is explicitly EXCLUDED from the suffix charset extension
1774        // (ADR-0026 §"Out of scope"). This test guards against an
1775        // over-broad `matches!` arm (e.g. an accidental `':'..=';'` range
1776        // typo) re-admitting `;` along with `:`.
1777        let result = Doi::parse("10.1234/foo;bar");
1778        assert!(
1779            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })),
1780            "expected InvalidDoiSuffixChar with ch=';', got {:?}",
1781            result
1782        );
1783    }
1784
1785    #[test]
1786    fn doi_parse_accepts_suffix_at_max_len_boundary() {
1787        // Rule: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is accepted;
1788        // 1 byte more is rejected (covered separately below).
1789        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
1790        let input = format!("10.1234/{}", suffix);
1791        let d = Doi::parse(&input).expect("suffix at max len");
1792        assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
1793    }
1794
1795    #[test]
1796    fn doi_parse_uri_scheme_is_case_insensitive() {
1797        // Rule: be lenient on scheme casing; the scheme is stripped
1798        // either way so the stored form is identical.
1799        let d = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
1800        assert_eq!(d.as_str(), "10.1234/example");
1801    }
1802
1803    // ---- Doi::parse rejection paths (≥6) ----------------------------
1804
1805    #[test]
1806    fn doi_parse_rejects_missing_10_prefix() {
1807        // Rule: must start with "10." literal.
1808        assert_eq!(
1809            Doi::parse("11.1234/example"),
1810            Err(RefParseError::MissingDoiPrefix)
1811        );
1812    }
1813
1814    #[test]
1815    fn doi_parse_rejects_empty_input() {
1816        // Rule: empty inputs are not valid DOIs.
1817        assert_eq!(Doi::parse(""), Err(RefParseError::Empty));
1818    }
1819
1820    #[test]
1821    fn doi_parse_rejects_missing_suffix_separator() {
1822        // Rule: must contain a `/` between registrant and suffix.
1823        assert_eq!(
1824            Doi::parse("10.1234"),
1825            Err(RefParseError::MissingDoiSuffixSeparator)
1826        );
1827    }
1828
1829    #[test]
1830    fn doi_parse_rejects_empty_suffix() {
1831        // Rule: suffix must be non-empty.
1832        assert_eq!(Doi::parse("10.1234/"), Err(RefParseError::EmptyDoiSuffix));
1833    }
1834
1835    #[test]
1836    fn doi_parse_rejects_invalid_registrant_too_short() {
1837        // Rule: registrant must be 4–9 digits.
1838        assert_eq!(
1839            Doi::parse("10.12/example"),
1840            Err(RefParseError::InvalidDoiRegistrant)
1841        );
1842    }
1843
1844    #[test]
1845    fn doi_parse_rejects_non_digit_registrant() {
1846        // Rule: registrant chars must all be ASCII digits.
1847        assert_eq!(
1848            Doi::parse("10.12ab/example"),
1849            Err(RefParseError::InvalidDoiRegistrant)
1850        );
1851    }
1852
1853    #[test]
1854    fn doi_parse_rejects_control_char_in_suffix() {
1855        // Rule (from docs/SECURITY.md §1.1, log-injection mitigation):
1856        // control chars are not in the suffix charset; reject before they
1857        // can reach the provenance log.
1858        let result = Doi::parse("10.1234/foo\nbar");
1859        assert!(
1860            matches!(
1861                result,
1862                Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
1863            ),
1864            "got {:?}",
1865            result
1866        );
1867    }
1868
1869    #[test]
1870    fn doi_parse_rejects_suffix_over_max_len() {
1871        // Rule: DOI_SUFFIX_MAX_LEN + 1 bytes is rejected.
1872        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 1);
1873        let input = format!("10.1234/{}", suffix);
1874        let result = Doi::parse(&input);
1875        match result {
1876            Err(RefParseError::DoiSuffixTooLong { len, max }) => {
1877                assert_eq!(len, DOI_SUFFIX_MAX_LEN + 1);
1878                assert_eq!(max, DOI_SUFFIX_MAX_LEN);
1879            }
1880            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
1881        }
1882    }
1883
1884    #[test]
1885    fn doi_parse_rejects_non_ascii_in_suffix() {
1886        // Rule: spec charset is ASCII-only; non-ASCII becomes an
1887        // InvalidDoiSuffixChar (consistent with safekey behavior of
1888        // collapsing such chars to '_', which is a downstream concern).
1889        let result = Doi::parse("10.1234/物理学");
1890        assert!(
1891            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. })),
1892            "got {:?}",
1893            result
1894        );
1895    }
1896
1897    // ---- ArxivId::parse happy paths (≥6) ----------------------------
1898
1899    #[test]
1900    fn arxiv_parse_accepts_new_style_4_digit_seq() {
1901        // Rule: new-style YYMM.NNNN (4-digit sequence number).
1902        let a = ArxivId::parse("0704.0001").expect("new-style 4-digit seq");
1903        assert_eq!(a.as_str(), "0704.0001");
1904    }
1905
1906    #[test]
1907    fn arxiv_parse_accepts_new_style_5_digit_seq() {
1908        // Rule: new-style YYMM.NNNNN (5-digit sequence number, post-2015).
1909        let a = ArxivId::parse("2401.12345").expect("new-style 5-digit seq");
1910        assert_eq!(a.as_str(), "2401.12345");
1911    }
1912
1913    #[test]
1914    fn arxiv_parse_accepts_new_style_with_version() {
1915        // Rule: optional `vN` version suffix.
1916        let a = ArxivId::parse("2401.12345v2").expect("with version");
1917        assert_eq!(a.as_str(), "2401.12345v2");
1918    }
1919
1920    #[test]
1921    fn arxiv_parse_accepts_old_style() {
1922        // Rule: old-style subject-class/YYMMNNN.
1923        let a = ArxivId::parse("cond-mat/9501001").expect("old-style cond-mat");
1924        assert_eq!(a.as_str(), "cond-mat/9501001");
1925    }
1926
1927    #[test]
1928    fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
1929        // Rule: old-style subject-class may have a `.XX` two-upper subclass
1930        // and an optional `vN` suffix.
1931        let a = ArxivId::parse("astro-ph.CO/0703123v2").expect("old-style with subclass + version");
1932        assert_eq!(a.as_str(), "astro-ph.CO/0703123v2");
1933    }
1934
1935    #[test]
1936    fn arxiv_parse_accepts_arxiv_uri_scheme() {
1937        // Rule: `arxiv:` / `arXiv:` scheme is stripped at construction.
1938        let a = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
1939        assert_eq!(a.as_str(), "2401.12345");
1940    }
1941
1942    #[test]
1943    fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
1944        // Rule: scheme case-insensitive; matches the `arXiv:` form named
1945        // in docs/MCP_TOOLS.md.
1946        let a = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
1947        assert_eq!(a.as_str(), "2401.12345v2");
1948    }
1949
1950    // ---- ArxivId::parse rejection paths (≥6) ------------------------
1951
1952    #[test]
1953    fn arxiv_parse_rejects_empty_input() {
1954        // Rule: empty rejected up-front.
1955        assert_eq!(ArxivId::parse(""), Err(RefParseError::Empty));
1956    }
1957
1958    #[test]
1959    fn arxiv_parse_rejects_no_dot_or_slash() {
1960        // Rule: must contain `.` (new-style) or `/` (old-style).
1961        assert_eq!(
1962            ArxivId::parse("notanarxivid"),
1963            Err(RefParseError::InvalidArxivShape)
1964        );
1965    }
1966
1967    #[test]
1968    fn arxiv_parse_rejects_new_style_wrong_head_length() {
1969        // Rule: head must be exactly 4 digits.
1970        assert_eq!(
1971            ArxivId::parse("240.12345"),
1972            Err(RefParseError::InvalidArxivShape)
1973        );
1974    }
1975
1976    #[test]
1977    fn arxiv_parse_rejects_new_style_seq_too_short() {
1978        // Rule: seq must be 4–5 digits.
1979        assert_eq!(
1980            ArxivId::parse("2401.123"),
1981            Err(RefParseError::InvalidArxivShape)
1982        );
1983    }
1984
1985    #[test]
1986    fn arxiv_parse_rejects_old_style_wrong_id_length() {
1987        // Rule: old-style id is exactly 7 digits.
1988        assert_eq!(
1989            ArxivId::parse("cond-mat/95001"),
1990            Err(RefParseError::InvalidArxivShape)
1991        );
1992    }
1993
1994    #[test]
1995    fn arxiv_parse_rejects_invalid_version_suffix() {
1996        // Rule: version suffix is `v` followed by ≥1 digits, nothing else.
1997        assert_eq!(
1998            ArxivId::parse("2401.12345v"),
1999            Err(RefParseError::InvalidArxivShape)
2000        );
2001    }
2002
2003    #[test]
2004    fn arxiv_parse_rejects_control_char() {
2005        // Rule (docs/SECURITY.md §1.1 log-injection): no control chars.
2006        assert_eq!(
2007            ArxivId::parse("2401.12345\n"),
2008            Err(RefParseError::InvalidArxivShape)
2009        );
2010    }
2011
2012    #[test]
2013    fn arxiv_parse_rejects_non_ascii() {
2014        // Rule: ASCII-only.
2015        assert_eq!(
2016            ArxivId::parse("2401.物理"),
2017            Err(RefParseError::InvalidArxivShape)
2018        );
2019    }
2020
2021    // ---- Ref::parse happy paths (≥6) --------------------------------
2022
2023    #[test]
2024    fn ref_parse_dispatches_doi_scheme_to_doi() {
2025        // Detection rule 1: explicit `doi:` scheme.
2026        match Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi") {
2027            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/example"),
2028            other => panic!("expected Ref::Doi, got {:?}", other),
2029        }
2030    }
2031
2032    #[test]
2033    fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
2034        // Detection rule 2: explicit `arxiv:` scheme.
2035        match Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv") {
2036            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2037            other => panic!("expected Ref::Arxiv, got {:?}", other),
2038        }
2039    }
2040
2041    #[test]
2042    fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
2043        // Detection rule 2 (case-insensitive): `arXiv:` form.
2044        match Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched") {
2045            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2046            other => panic!("expected Ref::Arxiv, got {:?}", other),
2047        }
2048    }
2049
2050    #[test]
2051    fn ref_parse_bare_doi_resolves_to_doi() {
2052        // Detection rule 3: bare input starting with `10.` is a DOI.
2053        match Ref::parse("10.1234/foo").expect("bare DOI") {
2054            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/foo"),
2055            other => panic!("expected Ref::Doi, got {:?}", other),
2056        }
2057    }
2058
2059    #[test]
2060    fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
2061        // Detection rule 4: bare input not starting with `10.` falls
2062        // through to arXiv. Tests the ambiguous-input branch named in the
2063        // PR brief: `2401.12345` should resolve to ArxivId.
2064        match Ref::parse("2401.12345").expect("bare new-style arXiv") {
2065            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2066            other => panic!("expected Ref::Arxiv, got {:?}", other),
2067        }
2068    }
2069
2070    #[test]
2071    fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
2072        // Detection rule 4: bare old-style arXiv id.
2073        match Ref::parse("cond-mat/9501001").expect("bare old-style arXiv") {
2074            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2075            other => panic!("expected Ref::Arxiv, got {:?}", other),
2076        }
2077    }
2078
2079    // ---- Ref::parse rejection paths (≥6) ----------------------------
2080
2081    #[test]
2082    fn ref_parse_rejects_empty() {
2083        // Rule: empty up-front.
2084        assert_eq!(Ref::parse(""), Err(RefParseError::Empty));
2085    }
2086
2087    #[test]
2088    fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
2089        // When the scheme is explicit, we surface the parser's error
2090        // verbatim — not a generic "shape mismatch".
2091        assert_eq!(
2092            Ref::parse("doi:10.1234"),
2093            Err(RefParseError::MissingDoiSuffixSeparator)
2094        );
2095    }
2096
2097    #[test]
2098    fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
2099        assert_eq!(
2100            Ref::parse("arxiv:notanid"),
2101            Err(RefParseError::InvalidArxivShape)
2102        );
2103    }
2104
2105    #[test]
2106    fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
2107        // Bare `10.…` heuristic: DOI parser is dispatched and its error
2108        // surfaces (here: bad registrant).
2109        assert_eq!(
2110            Ref::parse("10.12/x"),
2111            Err(RefParseError::InvalidDoiRegistrant)
2112        );
2113    }
2114
2115    #[test]
2116    fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
2117        // Bare ambiguous fallback: ArxivId parser is dispatched and its
2118        // error surfaces. `1.2.3` is neither a DOI nor an arXiv shape.
2119        assert_eq!(Ref::parse("1.2.3"), Err(RefParseError::InvalidArxivShape));
2120    }
2121
2122    #[test]
2123    fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
2124        // Length-bound: DOI suffix > DOI_SUFFIX_MAX_LEN through Ref::parse
2125        // surfaces DoiSuffixTooLong, not a generic InvalidArxivShape.
2126        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
2127        let input = format!("doi:10.1234/{}", suffix);
2128        match Ref::parse(&input) {
2129            Err(RefParseError::DoiSuffixTooLong { .. }) => {}
2130            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
2131        }
2132    }
2133
2134    #[test]
2135    fn ref_parse_round_trip_via_serde_preserves_inner_string() {
2136        // Wire-format check: Doi/ArxivId are #[serde(transparent)], and a
2137        // round-trip through Ref::parse → serde_json → Ref must preserve
2138        // the inner identifier. Guards against accidental scheme leakage
2139        // into the stored form.
2140        let r = Ref::parse("doi:10.1234/example").expect("parse ok");
2141        let json = serde_json::to_string(&r).expect("serialize");
2142        // The transparent inner value is the bare identifier (no `doi:`).
2143        assert!(
2144            json.contains("10.1234/example") && !json.contains("doi:"),
2145            "scheme leaked into wire form: {}",
2146            json
2147        );
2148    }
2149
2150    #[test]
2151    fn ref_parse_error_maps_to_invalid_ref_error_code() {
2152        // Public-API contract (docs/PUBLIC_API.md §4): all parse failures
2153        // collapse to ErrorCode::InvalidRef at the public boundary.
2154        let err: ErrorCode = RefParseError::Empty.into();
2155        assert_eq!(err, ErrorCode::InvalidRef);
2156        let err2: ErrorCode = RefParseError::MissingDoiPrefix.into();
2157        assert_eq!(err2, ErrorCode::InvalidRef);
2158    }
2159
2160    // -----------------------------------------------------------------
2161    // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2162    // -----------------------------------------------------------------
2163
2164    #[test]
2165    fn denial_reason_serializes_snake_case() {
2166        // ADR-0023 §2 / docs/PUBLIC_API.md §8: wire form is snake_case.
2167        let s = serde_json::to_string(&DenialReason::RedirectNotInAllowlist).expect("ser");
2168        assert_eq!(s, "\"redirect_not_in_allowlist\"");
2169        let s = serde_json::to_string(&DenialReason::SizeCapExceeded).expect("ser");
2170        assert_eq!(s, "\"size_cap_exceeded\"");
2171        let s = serde_json::to_string(&DenialReason::ContentTypeMismatch).expect("ser");
2172        assert_eq!(s, "\"content_type_mismatch\"");
2173    }
2174
2175    #[test]
2176    fn denial_reason_round_trip_via_serde() {
2177        // Round-trip every closed-set variant so adding a new variant
2178        // forces this test to be updated (the closed-set contract).
2179        for r in [
2180            DenialReason::RedirectNotInAllowlist,
2181            DenialReason::InsecureScheme,
2182            DenialReason::HostInBlockList,
2183            DenialReason::SizeCapExceeded,
2184            DenialReason::SchemaDrift,
2185            DenialReason::CapabilityNotGranted,
2186            DenialReason::RateLimitWindow,
2187            DenialReason::SsrfPrivateAddress,
2188            DenialReason::ContentTypeMismatch,
2189        ] {
2190            let s = serde_json::to_string(&r).expect("ser");
2191            let back: DenialReason = serde_json::from_str(&s).expect("de");
2192            assert_eq!(back, r, "round-trip mismatch for {:?} -> {}", r, s);
2193        }
2194    }
2195
2196    #[test]
2197    fn denial_context_round_trips_full_shape() {
2198        // A populated context (the redirect-denied case from ADR-0023 §1
2199        // example) survives a JSON round-trip. Whole-struct equality
2200        // exercises the `PartialEq` derive added per ADR-0023 §3 (added
2201        // in the multi-agent review feedback PR — see ADR-0023 history).
2202        let dc = DenialContext {
2203            reason: DenialReason::RedirectNotInAllowlist,
2204            source: Some("crossref".to_string()),
2205            attempted: Some("evil.example.com".to_string()),
2206            expected: Some(vec![
2207                "api.crossref.org".to_string(),
2208                "*.crossref.org".to_string(),
2209            ]),
2210            hop_index: Some(1),
2211            cap: None,
2212            actual: None,
2213        };
2214        let s = serde_json::to_string(&dc).expect("ser");
2215        let back: DenialContext = serde_json::from_str(&s).expect("de");
2216        assert_eq!(back, dc);
2217    }
2218
#[test]
fn denial_context_serialize_elides_empty_fields() {
    // A context carrying nothing but its (mandatory) reason must serialize
    // to a single-key object: every optional field that is `None` is
    // expected to be skipped via `skip_serializing_if = "Option::is_none"`
    // rather than emitted as `null`.
    let reason_only = DenialContext {
        reason: DenialReason::CapabilityNotGranted,
        source: None,
        attempted: None,
        expected: None,
        hop_index: None,
        cap: None,
        actual: None,
    };

    let wire = serde_json::to_string(&reason_only).expect("ser");

    assert_eq!(wire, r#"{"reason":"capability_not_granted"}"#);
}
2236
#[test]
fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
    // `expected: Some(vec![])` means "the allowlist was explicitly empty"
    // and is distinct from `expected: None` ("no allowlist recorded").
    // Only the `None` case may be dropped on serialize; the empty list has
    // to appear as `"expected":[]` and parse back as `Some(empty)`. The
    // earlier plain `Vec<String>` field shape could not tell the two apart.
    let explicit_empty = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(vec![]),
        hop_index: None,
        cap: None,
        actual: None,
    };

    let wire = serde_json::to_string(&explicit_empty).expect("ser");
    let has_empty_array = wire.contains(r#""expected":[]"#);
    assert!(
        has_empty_array,
        "expected:[] must survive on the wire (got: {s})",
        s = wire
    );

    let decoded = serde_json::from_str::<DenialContext>(&wire).expect("de");
    let empty_allowlist: Vec<String> = Vec::new();
    assert_eq!(decoded.expected, Some(empty_allowlist));
}
2260
#[test]
fn denial_context_deserialize_tolerates_missing_optional_fields() {
    // Consumer-side contract from ADR-0023 §3: a payload may carry any
    // subset of the optional fields. Here only `reason`, `cap` and `actual`
    // are on the wire; the remaining fields must fall back to `None`
    // (via `#[serde(default)]`) instead of failing the parse.
    let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;

    let DenialContext {
        reason,
        source,
        attempted,
        expected,
        hop_index,
        cap,
        actual,
    } = serde_json::from_str::<DenialContext>(wire).expect("de");

    // Fields that were present on the wire.
    assert_eq!(reason, DenialReason::SizeCapExceeded);
    assert_eq!(cap, Some(104_857_600));
    assert_eq!(actual, Some(209_715_200));

    // Fields that were absent from the wire.
    assert!(source.is_none());
    assert!(attempted.is_none());
    assert!(expected.is_none());
    assert!(hop_index.is_none());
}
2276
#[test]
fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
    // Byte-exact pin of the complete failure envelope described in
    // docs/ERRORS.md §3 + §3.1 and ADR-0023 §1. Any change to key order or
    // to the skip rules anywhere along the serialization chain shows up
    // here as a string mismatch.
    //
    // Why the keys below are alphabetical: `json!` builds a
    // `serde_json::Map`, which orders keys alphabetically unless the
    // `preserve_order` feature is enabled (this crate does not enable it).
    // The embedded `DenialContext` is converted through that same Map, so
    // its fields are alphabetical too, which is NOT the struct declaration
    // order that a direct `to_string(&DenialContext)` would produce. The
    // pinned string therefore documents the Map-canonicalised wire shape
    // on purpose.
    let denial_context = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(vec![
            String::from("api.crossref.org"),
            String::from("*.crossref.org"),
        ]),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };
    let message = "redirect target evil.example.com not in allowlist for source crossref";

    let envelope = serde_json::json!({
        "ok": false,
        "error": {
            "code": ErrorCode::NetworkError,
            "message": message,
            "denial_context": denial_context,
        }
    });

    let pinned = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
    let rendered = serde_json::to_string(&envelope).expect("serialize envelope");
    assert_eq!(rendered, pinned);
}
2314
#[test]
fn denial_context_rejects_unknown_fields() {
    // Per ADR-0023 §3 and PUBLIC_API.md §8 the struct is expected to carry
    // `#[serde(deny_unknown_fields)]`: a key the struct does not declare
    // (`banana` here) has to fail the parse, which keeps any future field
    // addition a deliberate breaking change.
    let wire = r#"{"reason":"capability_not_granted","banana":1}"#;

    let parsed = serde_json::from_str::<DenialContext>(wire);

    assert!(
        parsed.is_err(),
        "deny_unknown_fields must reject 'banana': {:?}",
        parsed.map(|ctx| ctx.reason),
    );
}
2328}