Skip to main content

doiget_core/
orchestrator.rs

1//! Cross-source orchestrators that compose multiple [`Source`] impls into
2//! a single user-facing operation.
3//!
4//! Slice 2 of the doiget roadmap promotes [`fetch_paper`] and
5//! [`batch_fetch`] from `doiget-cli` into this module so the MCP server
6//! (`doiget-mcp`) and the CLI share one source of truth for the per-ref
7//! orchestration. The CLI's `commands::fetch::fetch_one` is now a thin
8//! wrapper that delegates here and adds the human-facing stderr print
9//! line. Dry-run preview helpers live as [`fetch_paper_plan`] and
10//! [`batch_fetch_plans`].
11//!
12//! [`Source`]: crate::source::Source
13
14use std::collections::BTreeMap;
15
16use camino::{Utf8Path, Utf8PathBuf};
17use chrono::Utc;
18use serde_json::Value;
19
20use crate::dry_run::{build_fetch_plan, try_build_fetch_plan, FetchPlan};
21use crate::http::HttpError;
22use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
23use crate::source::{FetchContext, FetchError, FetchResult, Source};
24use crate::sources::arxiv::ArxivSource;
25use crate::sources::crossref::CrossrefSource;
26use crate::sources::unpaywall::UnpaywallSource;
27use crate::store::{DoigetExtension, Metadata, Store};
28use crate::{ArxivId, CapabilityProfile, Doi, Ref, Safekey, MAX_BATCH_REFS, SCHEMA_VERSION};
29
30/// Outcome of a successful [`metadata_only`] call.
31///
32/// Mirrors the wire shape documented in `docs/MCP_TOOLS.md` §11: the
33/// `source` identifies which resolver produced the metadata, `license`
34/// is the OA license string when known (Unpaywall channel), `oa_url` is
35/// the discovered OA URL **(never followed by this orchestrator)**, and
36/// `metadata` is the source's native JSON payload (Crossref `message`,
37/// Unpaywall work record, or the parsed arXiv Atom-feed object).
38///
39/// `metadata` is serialized as-is by the MCP envelope builder
40/// (`crates/doiget-mcp/src/lib.rs`); we deliberately do NOT normalize
41/// here so the agent can see exactly what the source returned.
42#[derive(Debug, Clone)]
43#[non_exhaustive]
44pub struct MetadataOnlyOutcome {
45    /// Resolver key that produced the metadata payload. One of
46    /// `"crossref"`, `"unpaywall"`, `"arxiv"` (the closed set named in
47    /// `docs/MCP_TOOLS.md` §11 type alias).
48    pub source: String,
49    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
50    /// was minted for this call. In Slice 4 this equals
51    /// [`Self::source`] verbatim (the metadata-only path emits one row
52    /// per consulted resolver); future slices that introduce overlapping
53    /// resolvers MAY have `resolver_profile != source`. Surfaced through
54    /// the `doiget_metadata_only` MCP envelope per ADR-0021 §4.
55    pub resolver_profile: String,
56    /// OA license string when the resolver could supply one (today only
57    /// the Unpaywall fallback path populates this). `None` when the
58    /// primary source did not surface a license.
59    pub license: Option<String>,
60    /// Discovered OA URL — surfaced to the caller for separate action,
61    /// **never followed by this orchestrator**. The Crossref response's
62    /// `message.link[]` array is mined first; the Unpaywall fallback
63    /// path uses `best_oa_location.url_for_pdf` (or `url`).
64    pub oa_url: Option<String>,
65    /// Source's native metadata payload. For Crossref this is the
66    /// `message` object; for Unpaywall the work record; for arXiv the
67    /// parsed Atom-feed JSON (see
68    /// `crate::sources::arxiv::parse_atom_feed`).
69    pub metadata: Value,
70}
71
72/// Resolve a [`Ref`] to metadata WITHOUT triggering a publisher PDF
73/// fetch.
74///
75/// Binding spec: `docs/MCP_TOOLS.md` §11 (NORMATIVE — this function
76/// MUST NOT call [`crate::http::HttpClient::fetch_pdf`] under any code
77/// path). The posture-lint workflow greps for that pattern; the test
78/// suite additionally exercises the DOI and arXiv branches end-to-end
79/// against wiremock to assert the OA URL is reported, not followed.
80///
81/// # Dispatch
82///
83/// - `Ref::Doi(_)` → Crossref first (bibliographic metadata + OA URL
84///   via `message.link[]`). If Crossref returns a usable payload the
85///   call returns immediately; Unpaywall is consulted only as a fallback
86///   when Crossref fails. The Unpaywall fallback surfaces a license
87///   string and may overwrite `oa_url` with the `best_oa_location`
88///   channel.
89/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch_metadata_only`]: ONLY the
90///   Atom feed (`https://export.arxiv.org/api/query?id_list=<id>`) is
91///   consulted; the PDF endpoint is NOT touched. `license` is set to
92///   the platform-wide `"arxiv-default"` token, `oa_url` is `None`
93///   (the arXiv abstract page is not a PDF URL).
94///
95/// # Side effects
96///
97/// Each consulted source appends ONE `LogEvent::Fetch` row to
98/// `ctx.log` (arXiv emits its row under `Capability::Metadata`; the
99/// DOI sources emit under `Capability::Oa` — they pre-date this
100/// distinction and a follow-up slice may unify them). The orchestrator
101/// itself does NOT bracket the call with `SessionStart` / `SessionEnd`
102/// rows — that is the MCP server's responsibility (it owns the
103/// per-tool-call session boundary).
104///
105/// This function is the **pure resolver**: it consults the source(s)
106/// and emits provenance rows, but it does NOT write to the store.
107/// The `docs/MCP_TOOLS.md` §11 store-write SIDE EFFECT is provided by
108/// [`metadata_only_to_store`], which wraps this and persists the
109/// metadata TOML to `<root>/.metadata/<safekey>.toml`. Keeping the
110/// store-write in a *separate* entry point is exactly what lets
111/// [`resolve_only`] safely delegate here — its contract forbids any
112/// store write, and a pure `metadata_only` can never regress that
113/// invariant (#139).
114///
115/// # Errors
116///
117/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
118/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
119/// via the existing `From<FetchError> for ErrorCode` impl.
120// Stays `pub` (a `pub(crate)` compile-time guard was considered and
121// rejected): `crates/doiget-core/tests/` integration tests
122// (`real_world_fixtures_e2e`) legitimately drive the PURE resolver
123// directly and assert its outcome, and `tests/` compiles as a separate
124// crate. The #139 pre-fix bug (an MCP caller
125// picking the pure variant when it needed persistence) is instead
126// prevented *structurally*: the MCP layer imports only
127// `metadata_only_to_store`, and `resolve_only` delegates to this pure
128// fn — neither can acquire or skip the store-write by mistake.
129pub async fn metadata_only(
130    ref_: &Ref,
131    profile: &CapabilityProfile,
132    ctx: &FetchContext,
133) -> Result<MetadataOnlyOutcome, FetchError> {
134    match ref_ {
135        Ref::Doi(doi) => metadata_only_doi(doi, ref_, profile, ctx).await,
136        Ref::Arxiv(id) => {
137            let arxiv = arxiv_source_from_env();
138            let metadata = arxiv.fetch_metadata_only(id, ctx).await?;
139            // Pure resolver — no store write here (see fn doc); the
140            // store-write side effect lives in `metadata_only_to_store`.
141            Ok(MetadataOnlyOutcome {
142                source: arxiv.name().to_string(),
143                resolver_profile: arxiv.name().to_string(),
144                license: Some("arxiv-default".to_string()),
145                oa_url: None,
146                metadata,
147            })
148        }
149    }
150}
151
152/// Resolve a [`Ref`] to metadata with **no local persistence**.
153///
154/// This is the audit-trail-preserving sibling of [`metadata_only`]: each
155/// consulted [`Source`] still emits its own `LogEvent::Fetch` row
156/// through `ctx.log` (so the provenance hash chain remains continuous,
157/// per `docs/PROVENANCE_LOG.md`), but the orchestrator MUST NOT write
158/// the metadata TOML to the store under any code path — present or
159/// future.
160///
161/// Binding spec: `docs/MCP_TOOLS.md` §1 (the `doiget_resolve_paper`
162/// tool — Slice 7).
163///
164/// # Why this exists as a distinct orchestrator
165///
166/// [`metadata_only`] is the **pure resolver** and never writes to the
167/// store; the store-write SIDE EFFECT lives only in the separate
168/// [`metadata_only_to_store`] wrapper. Because the write is in a
169/// *different* entry point that this function does not call,
170/// delegating to [`metadata_only`] is permanently safe — there is no
171/// code path by which `resolve_only` can acquire a store write, now or
172/// in future (#139). This structural separation is the entire reason
173/// `metadata_only` was split into a pure core + a persisting wrapper
174/// rather than gaining a `write: bool` parameter.
175///
176/// # Dispatch
177///
178/// Identical to [`metadata_only`] (DOI → Crossref-first with Unpaywall
179/// fallback; arXiv → Atom feed only). The `oa_url` and `license`
180/// outputs follow the same rules.
181///
182/// # Side effects
183///
184/// One `LogEvent::Fetch` row per consulted resolver, written by the
185/// underlying [`Source`] impls. No metadata TOML write. No PDF fetch.
186/// No store mutation.
187///
188/// # Errors
189///
190/// Returns [`FetchError`] from the underlying [`Source`] dispatch,
191/// identical to [`metadata_only`].
192pub async fn resolve_only(
193    ref_: &Ref,
194    profile: &CapabilityProfile,
195    ctx: &FetchContext,
196) -> Result<MetadataOnlyOutcome, FetchError> {
197    // Delegating to the PURE `metadata_only` is the contract-correct
198    // implementation, not a placeholder: `metadata_only` never writes
199    // to the store (the persisting path is the separate
200    // `metadata_only_to_store`, which this function does not call), so
201    // `resolve_only`'s "no store mutation" guarantee holds structurally
202    // and cannot regress (#139).
203    metadata_only(ref_, profile, ctx).await
204}
205
206/// Resolve a [`Ref`] to metadata **and persist the metadata TOML to the
207/// store** — the `docs/MCP_TOOLS.md` §11 `doiget_metadata_only` SIDE
208/// EFFECT (#139).
209///
210/// Wraps the pure [`metadata_only`]: it runs the same resolver dispatch
211/// (so the provenance hash chain is identical), then writes
212/// `<root>/.metadata/<safekey>.toml` via the same
213/// `write_metadata_and_pdf` path `fetch_paper` uses for its
214/// metadata-only fallback, emitting one `StoreWrite` provenance row.
215///
216/// [`resolve_only`] MUST NOT call this — its contract forbids any store
217/// write. The split (pure core vs. persisting wrapper) makes that
218/// invariant structural rather than a convention.
219///
220/// # Errors
221///
222/// [`FetchError`] from the underlying resolver dispatch, or — if the
223/// store write fails — [`FetchError::SourceSchema`] (the closest
224/// closed-set arm; there is no dedicated `FetchError::StoreError`, so
225/// the MCP boundary maps it to `INTERNAL_ERROR` — see the inline note
226/// in `write_metadata_and_pdf`). On store-write failure
227/// `write_metadata_and_pdf` makes a **best-effort** attempt to
228/// append a `StoreWrite`/`Err` provenance row before the error
229/// propagates (that append's own failure is not separately surfaced —
230/// this matches the pre-existing `fetch_paper` metadata-only fallback
231/// path and is out of scope for #139).
232pub async fn metadata_only_to_store(
233    ref_: &Ref,
234    profile: &CapabilityProfile,
235    ctx: &FetchContext,
236    store: &dyn Store,
237) -> Result<MetadataOnlyOutcome, FetchError> {
238    let outcome = metadata_only(ref_, profile, ctx).await?;
239    let safekey = ref_.safekey();
240    let metadata = build_metadata_only_metadata(ref_, &outcome);
241    // `pdf_src = None` => writes `<root>/.metadata/<safekey>.toml` and
242    // appends the `StoreWrite` row (the exact path `fetch_paper` uses
243    // for its DOI metadata-only fallback).
244    write_metadata_and_pdf(store, &safekey, &metadata, None, ctx)?;
245    Ok(outcome)
246}
247
248/// Build the [`Metadata`] persisted by [`metadata_only_to_store`].
249///
250/// Minimal but valid: enough that a subsequent `doiget_info` returns a
251/// non-null `metadata` object (the #139 acceptance criterion). Title is
252/// best-effort from the resolver payload (`title` as a string, or the
253/// first element if it is an array — Crossref's `message.title` is
254/// typically an array, arXiv/Unpaywall typically a string; the
255/// extractor tolerates either regardless of source); it falls back to
256/// the ref id so the required `title` field is never empty.
257/// Bibliographic enrichment
258/// (year, venue, …) is intentionally out of scope here — the
259/// metadata-only contract is "persist what the resolver returned", and
260/// the raw payload is preserved verbatim in `MetadataOnlyOutcome`.
261fn build_metadata_only_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
262    let (doi, arxiv_id) = match ref_ {
263        Ref::Doi(d) => (Some(d.clone()), None),
264        Ref::Arxiv(a) => (None, Some(a.clone())),
265    };
266    let ref_id = ref_.as_input_str().to_string();
267    let title = match extract_metadata_title(&outcome.metadata) {
268        Some(t) => t,
269        None => {
270            // The resolver returned a payload with no usable title.
271            // Persisting the ref id keeps the entry valid (#139), but
272            // emit a diagnostic so a broken/partial resolver response is
273            // not silently indistinguishable from a genuine title.
274            tracing::warn!(
275                ref_id = %ref_id,
276                source = %outcome.source,
277                "metadata-only: no usable title in resolver payload; \
278                 persisting the ref id as the title placeholder"
279            );
280            ref_id
281        }
282    };
283    Metadata {
284        schema_version: SCHEMA_VERSION.to_string(),
285        title,
286        authors: extract_metadata_authors(&outcome.metadata),
287        year: None,
288        doi,
289        arxiv_id,
290        abstract_: None,
291        venue: None,
292        publisher: None,
293        issn: None,
294        isbn: None,
295        type_: None,
296        keywords: Vec::new(),
297        url: outcome.oa_url.clone(),
298        pdf_path: None,
299        doiget: Some(DoigetExtension {
300            fetched_at: Utc::now(),
301            source: outcome.source.clone(),
302            license: outcome
303                .license
304                .clone()
305                .unwrap_or_else(|| "unknown".to_string()),
306            size_bytes: 0,
307            mcp_call_id: None,
308        }),
309        other: BTreeMap::new(),
310    }
311}
312
313/// `title` from a resolver payload: a bare string, or the first
314/// **non-blank** element of an array (Crossref `message.title` is
315/// `[String]`; a leading empty/whitespace element is skipped rather
316/// than masking the real title). Trimmed. `None` if absent/blank.
317fn extract_metadata_title(meta: &Value) -> Option<String> {
318    let t = meta.get("title")?;
319    let s = match t.as_str() {
320        Some(s) => s.trim().to_string(),
321        None => t
322            .as_array()?
323            .iter()
324            .filter_map(Value::as_str)
325            .map(str::trim)
326            .find(|s| !s.is_empty())?
327            .to_string(),
328    };
329    if s.is_empty() {
330        None
331    } else {
332        Some(s)
333    }
334}
335
336/// Best-effort author list, tolerant of the resolver shapes we may see:
337/// Crossref `author: [{given,family}]`, arXiv `authors: [String]`, and
338/// a `z_authors: [{given,family}]` fallback. NOTE: doiget's Unpaywall
339/// source deserializes a *partial* `UnpaywallWork` that does not capture
340/// `z_authors`, so the `z_authors` branch is currently inert for the
341/// Unpaywall path (kept as forward-compat for if/when that struct
342/// captures it) — Unpaywall-sourced metadata-only entries get an empty
343/// author list. Returns `Vec::new()` when nothing is parseable (a valid
344/// metadata TOML — #139 only requires the entry to exist and be
345/// readable).
346fn extract_metadata_authors(meta: &Value) -> Vec<String> {
347    if let Some(arr) = meta.get("authors").and_then(Value::as_array) {
348        let v: Vec<String> = arr
349            .iter()
350            .filter_map(|a| a.as_str().map(str::to_string))
351            .collect();
352        if !v.is_empty() {
353            return v;
354        }
355    }
356    for key in ["author", "z_authors"] {
357        if let Some(arr) = meta.get(key).and_then(Value::as_array) {
358            let v: Vec<String> = arr
359                .iter()
360                .filter_map(|a| {
361                    let given = a.get("given").and_then(Value::as_str).unwrap_or("");
362                    let family = a.get("family").and_then(Value::as_str).unwrap_or("");
363                    let name = format!("{given} {family}");
364                    let name = name.trim();
365                    if name.is_empty() {
366                        a.get("name").and_then(Value::as_str).map(str::to_string)
367                    } else {
368                        Some(name.to_string())
369                    }
370                })
371                .collect();
372            if !v.is_empty() {
373                return v;
374            }
375        }
376    }
377    Vec::new()
378}
379
380// ---------------------------------------------------------------------------
381// Env-aware source constructors (mirrors doiget-cli::commands::fetch::build_*)
382//
383// These let MCP integration tests redirect the orchestrator at a
384// wiremock origin via `DOIGET_*_BASE` env vars, without inverting the
385// `doiget-mcp -> doiget-core` wiring by depending on `doiget-cli`. The
386// override surface is identical to the CLI's `fetch.rs::build_*_source`
387// helpers so a single test fixture can drive both crates.
388// ---------------------------------------------------------------------------
389
/// Contact address used when `DOIGET_CONTACT_EMAIL` does not provide
/// one: the same `doiget@localhost` the CLI uses
/// (`crates/doiget-cli/src/commands/fetch.rs::OrchestratorConfig`).
const FALLBACK_CONTACT_EMAIL: &str = "doiget@localhost";

/// Contact e-mail handed to the Crossref / Unpaywall source
/// constructors.
///
/// Reads `DOIGET_CONTACT_EMAIL` and returns it with surrounding
/// whitespace trimmed. A variable that is unset, not valid Unicode, or
/// blank (empty / whitespace-only) yields [`FALLBACK_CONTACT_EMAIL`],
/// so an accidental `DOIGET_CONTACT_EMAIL=` never results in an empty
/// contact address.
fn contact_email_from_env() -> String {
    std::env::var("DOIGET_CONTACT_EMAIL")
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|email| !email.is_empty())
        .unwrap_or_else(|| FALLBACK_CONTACT_EMAIL.to_string())
}
397
398fn arxiv_source_from_env() -> ArxivSource {
399    if let Ok(s) = std::env::var("DOIGET_ARXIV_BASE") {
400        if let Ok(url) = url::Url::parse(&s) {
401            return ArxivSource::with_base(url);
402        }
403    }
404    ArxivSource::new()
405}
406
407fn crossref_source_from_env(contact: &str) -> CrossrefSource {
408    if let Ok(s) = std::env::var("DOIGET_CROSSREF_BASE") {
409        if let Ok(url) = url::Url::parse(&s) {
410            return CrossrefSource::with_base(url, contact.to_string());
411        }
412    }
413    CrossrefSource::new(contact.to_string())
414}
415
416fn unpaywall_source_from_env(contact: &str) -> UnpaywallSource {
417    if let Ok(s) = std::env::var("DOIGET_UNPAYWALL_BASE") {
418        if let Ok(url) = url::Url::parse(&s) {
419            return UnpaywallSource::with_base(url, contact.to_string());
420        }
421    }
422    UnpaywallSource::new(contact.to_string())
423}
424
425/// DOI branch — Crossref first, with Unpaywall as a fallback when
426/// Crossref fails. Crossref's `message.link[]` array (when present)
427/// supplies the OA URL hint without making a publisher request.
428async fn metadata_only_doi(
429    _doi: &Doi,
430    ref_: &Ref,
431    profile: &CapabilityProfile,
432    ctx: &FetchContext,
433) -> Result<MetadataOnlyOutcome, FetchError> {
434    let contact = contact_email_from_env();
435    let crossref = crossref_source_from_env(&contact);
436    match crossref.fetch(ref_, profile, ctx).await {
437        Ok(res) => {
438            let metadata = res.metadata_json.unwrap_or(Value::Null);
439            let oa_url = extract_crossref_oa_url(&metadata);
440            // Pure resolver — no store write here (see `metadata_only`
441            // doc); persistence is `metadata_only_to_store`'s job.
442            Ok(MetadataOnlyOutcome {
443                source: crossref.name().to_string(),
444                resolver_profile: crossref.name().to_string(),
445                // Crossref does not surface a license directly; the
446                // license channel for DOI metadata is Unpaywall's
447                // `best_oa_location.license`. Leave `None` here; the
448                // agent can call `unpaywall` (or a follow-up slice's
449                // chained orchestrator) if it needs a license string.
450                license: None,
451                oa_url,
452                metadata,
453            })
454        }
455        Err(crossref_err) => {
456            // Crossref failed. Try Unpaywall as a fallback before
457            // surfacing the original error.
458            let unpaywall = unpaywall_source_from_env(&contact);
459            match unpaywall.fetch(ref_, profile, ctx).await {
460                Ok(res) => {
461                    let metadata = res.metadata_json.unwrap_or(Value::Null);
462                    let oa_url = extract_unpaywall_oa_url(&metadata);
463                    let license = if res.license == "unknown" {
464                        None
465                    } else {
466                        Some(res.license)
467                    };
468                    Ok(MetadataOnlyOutcome {
469                        source: unpaywall.name().to_string(),
470                        resolver_profile: unpaywall.name().to_string(),
471                        license,
472                        oa_url,
473                        metadata,
474                    })
475                }
476                Err(_unpaywall_err) => {
477                    // Both sources failed; surface the Crossref error
478                    // (the primary path) for diagnosability.
479                    Err(crossref_err)
480                }
481            }
482        }
483    }
484}
485
486/// Defensively pull a Crossref OA URL out of a `message.link[]` entry.
487///
488/// The Crossref `Link` model documents `link[].URL` as the OA URL string
489/// when the work has one (see
490/// `<https://api.crossref.org/swagger-ui/index.html>`). Multiple entries
491/// may be present; we return the first non-empty `URL` field
492/// encountered. Returns `None` if the array is missing, empty, or
493/// contains no usable URL string.
494fn extract_crossref_oa_url(msg: &Value) -> Option<String> {
495    let arr = msg.get("link")?.as_array()?;
496    arr.iter()
497        .filter_map(|entry| entry.get("URL").and_then(Value::as_str))
498        .find(|s| !s.is_empty())
499        .map(|s| s.to_string())
500}
501
502/// Defensively pull Unpaywall's preferred OA URL
503/// (`best_oa_location.url_for_pdf`, falling back to `.url`) out of a
504/// metadata payload.
505fn extract_unpaywall_oa_url(meta: &Value) -> Option<String> {
506    let loc = meta.get("best_oa_location")?;
507    loc.get("url_for_pdf")
508        .and_then(Value::as_str)
509        .or_else(|| loc.get("url").and_then(Value::as_str))
510        .map(|s| s.to_string())
511}
512
513// ---------------------------------------------------------------------------
514// fetch_paper — single-ref orchestrator (Slice 2)
515// ---------------------------------------------------------------------------
516
517/// Outcome of a successful [`fetch_paper`] call.
518///
519/// Wire shape mirrors `docs/MCP_TOOLS.md` §5 `FetchResult` minus the
520/// envelope chrome the MCP server wraps it in (`ok: true`, `ref`,
521/// optional `error`).
522///
523/// `path` is the absolute path of the resource the orchestrator wrote to
524/// the store. For arXiv refs and successful DOI OA-PDF fetches this is
525/// `<root>/<safekey>.pdf`; for the DOI metadata-only fallback (OA URL
526/// host off the `oa-publisher` allowlist, or PDF leg failed for another
527/// transport reason — `docs/REDIRECT_ALLOWLIST.md` §3 informed-best-
528/// effort posture) this is `<root>/.metadata/<safekey>.toml`.
529/// Outcome of the DOI OA-PDF leg, carried on [`FetchPaperOutcome`] so a
530/// caller can NEVER silently report a blocked PDF as a plain
531/// "metadata-only" success (issue #118). The product promise is
532/// "immediately explain WHY a paper can't be fetched" — the distinction
533/// between "there was no OA PDF to fetch" and "an OA PDF existed but we
534/// were blocked, and here is the reason" is exactly that explanation.
535#[derive(Debug, Clone)]
536#[non_exhaustive]
537pub enum PdfLegStatus {
538    /// A PDF was fetched and written to disk (arXiv always; DOI when
539    /// the OA-publisher leg succeeded).
540    Fetched,
541    /// No OA URL was discovered (Unpaywall reported no
542    /// `best_oa_location`). Metadata-only is the correct, expected
543    /// result here — not a failure.
544    NoOaUrl,
545    /// An OA URL *was* discovered but the PDF could not be retrieved
546    /// (host outside the oa-publisher allowlist, not-a-PDF body,
547    /// transport failure, …). Metadata was still written, but the
548    /// caller MUST surface this reason rather than pretending the
549    /// fetch was a clean metadata-only success.
550    Blocked {
551        /// Closed-set code, mapped from the underlying transport error
552        /// via the canonical `From<FetchError> for ErrorCode`.
553        code: crate::ErrorCode,
554        /// Human-readable one-line reason (the `FetchError` display).
555        message: String,
556        /// Structured denial side-channel (ADR-0023) when the failure
557        /// was an allowlist / scheme denial; `None` otherwise.
558        denial: Option<crate::DenialContext>,
559        /// Actionable suggested arXiv ID for the same paper when Unpaywall
560        /// metadata includes an arXiv alternative but the PDF leg was blocked.
561        suggested_arxiv_id: Option<String>,
562    },
563}
564
565/// What `fetch_paper` wrote to disk and how.
566///
567/// `path` is the PDF (`<root>/<safekey>.pdf`) on a successful PDF
568/// fetch, or the metadata TOML (`<root>/.metadata/<safekey>.toml`)
569/// when the DOI path fell back to metadata-only. [`Self::pdf_leg`]
570/// disambiguates *why* there is no PDF (genuinely none available vs.
571/// available-but-blocked) so callers never report a blocked PDF as a
572/// silent success (issue #118).
573#[derive(Debug, Clone)]
574#[non_exhaustive]
575pub struct FetchPaperOutcome {
576    /// `Source::name()` of the resolver whose payload landed on disk:
577    /// `"arxiv"` for an arXiv ref, `"oa-publisher"` when the DOI OA PDF
578    /// leg succeeded, or `"crossref"` / `"unpaywall"` when the DOI path
579    /// fell back to metadata-only. Mirrors the value written to
580    /// `[doiget].source` in the metadata TOML.
581    pub source: String,
582    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
583    /// was minted for the final artifact. For an arXiv fetch this is
584    /// `"arxiv"`; for a successful DOI OA PDF leg this is
585    /// `"oa-publisher"`; for the DOI metadata-only fallback this is the
586    /// metadata source key (`"crossref"` / `"unpaywall"`). Equal to
587    /// [`Self::source`] verbatim in Slice 4 but kept distinct so future
588    /// slices can decouple "which resolver wrote to disk" from "which
589    /// resolver is the audit identity". Surfaced through the
590    /// `doiget_fetch_paper` MCP envelope per ADR-0021 §4.
591    pub resolver_profile: String,
592    /// OA license string (`"CC-BY-4.0"`, `"cc-by"`, `"arxiv-default"`,
593    /// `"unknown"`). Mirrors `[doiget].license`.
594    pub license: String,
595    /// Absolute path of the artifact actually written
596    /// (`<root>/<safekey>.pdf` on success, `<root>/.metadata/<safekey>.toml`
597    /// on metadata-only fallback).
598    pub path: Utf8PathBuf,
599    /// Stored PDF size in bytes; `0` on the metadata-only fallback
600    /// (`docs/REDIRECT_ALLOWLIST.md` §3.5).
601    pub size_bytes: u64,
602    /// The schema version of the metadata TOML written
603    /// (always [`crate::SCHEMA_VERSION`] for this build).
604    pub schema_version: String,
605    /// What happened on the PDF leg (issue #118). `Fetched` /
606    /// `NoOaUrl` are clean outcomes; `Blocked` carries the structured
607    /// reason an OA PDF existed but could not be retrieved, so the
608    /// CLI / MCP surface it instead of a silent metadata-only success.
609    pub pdf_leg: PdfLegStatus,
610    /// Per-ref [`crate::Safekey`] stringified (`Ref::safekey().as_str()`).
611    /// Exposed on the outcome so JSON-mode CLI / MCP callers can
612    /// emit a structured success body without re-parsing the input
613    /// ref (#210 / `docs/ERRORS.md` §3). Always populated.
614    pub safekey: String,
615    /// ADR-0021 §1 canonical-digest as 64-char lowercase hex for the
616    /// resolver_profile that produced this outcome's audit identity.
617    /// For an arXiv fetch this is the digest under `"arxiv"`; for a
618    /// DOI OA PDF leg this is under `"oa-publisher"`; for the DOI
619    /// metadata-only fallback this is under the metadata source key
620    /// (`"crossref"` / `"unpaywall"`). Always populated.
621    pub canonical_digest: String,
622}
623
624impl FetchPaperOutcome {
625    /// Test-only constructor for downstream crates (`doiget-cli`,
626    /// `doiget-mcp`) that need to drive classification / rendering
627    /// logic without running the full orchestrator. Produces a
628    /// minimal but structurally-valid outcome — all required fields
629    /// populated with defensible stubs — so unit tests can assert
630    /// the surrounding behavior (JSONL shape, exit-code mapping,
631    /// PDF-leg branching) in isolation.
632    ///
633    /// `#[doc(hidden)]` because this is not a stable public API; the
634    /// signature may change to fit test needs without a CHANGELOG
635    /// `[BREAKING]` callout.
636    #[doc(hidden)]
637    pub fn for_test_synthetic(
638        safekey: impl Into<String>,
639        source: impl Into<String>,
640        pdf_leg: PdfLegStatus,
641    ) -> Self {
642        let safekey: String = safekey.into();
643        let source: String = source.into();
644        Self {
645            source: source.clone(),
646            resolver_profile: source.clone(),
647            license: "unknown".to_string(),
648            path: Utf8PathBuf::from(format!("/tmp/{safekey}.pdf")),
649            size_bytes: 0,
650            schema_version: SCHEMA_VERSION.to_string(),
651            pdf_leg,
652            safekey: safekey.clone(),
653            // 32 bytes of `0x00` → a stable, non-secret digest stub
654            // that's still 64 chars of lowercase hex.
655            canonical_digest: "00".repeat(32),
656        }
657    }
658}
659
660/// Resolve a [`Ref`] to a PDF (or metadata-only fallback) and write it
661/// through `store`.
662///
663/// Binding spec: `docs/MCP_TOOLS.md` §4 (`doiget_fetch_paper`),
664/// `docs/REDIRECT_ALLOWLIST.md` §3 (informed-best-effort posture for the
665/// DOI OA PDF leg), `docs/PROVENANCE_LOG.md` §3 (per-attempt `Fetch` rows
666/// emitted by the source impls; `StoreWrite` row emitted by this
667/// orchestrator).
668///
669/// # Dispatch
670///
671/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch`]; the source returns PDF
672///   bytes + Atom-feed metadata. The orchestrator writes both the PDF
673///   and the metadata TOML.
674/// - `Ref::Doi(_)` → Crossref metadata + Unpaywall license/OA-URL
675///   enrichment + (when the OA URL host is on the `oa-publisher`
676///   allowlist) a publisher PDF leg. A failure on the PDF leg is
677///   non-fatal: the metadata is still written and the orchestrator
678///   returns `Ok(...)` with `source` set to the metadata source.
679///
680/// # Side effects
681///
682/// Each consulted source emits one `LogEvent::Fetch` row via
683/// `ctx.log.append`. The orchestrator additionally emits one
684/// `LogEvent::StoreWrite` row on the successful write. Session bookend
685/// rows are the caller's responsibility (the CLI's
686/// `commands::fetch::run_with_options` wraps the call; the MCP server's
687/// `doiget_fetch_paper` tool method wraps it too).
688///
689/// # Errors
690///
691/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
692/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
693/// via the existing `From<FetchError> for ErrorCode` impl.
694pub async fn fetch_paper(
695    ref_: &Ref,
696    profile: &CapabilityProfile,
697    ctx: &FetchContext,
698    store: &dyn Store,
699    store_root: &Utf8Path,
700) -> Result<FetchPaperOutcome, FetchError> {
701    let safekey = ref_.safekey();
702    match ref_ {
703        Ref::Arxiv(id) => {
704            fetch_paper_arxiv(id, ref_, profile, ctx, store, store_root, &safekey).await
705        }
706        Ref::Doi(doi) => {
707            fetch_paper_doi(doi, ref_, profile, ctx, store, store_root, &safekey).await
708        }
709    }
710}
711
712/// Build the dry-run preview ([`FetchPlan`]) for a single ref without
713/// touching the network, store, or provenance log. Thin re-export of
714/// [`crate::dry_run::build_fetch_plan`] under the slice-2 naming the
715/// MCP tool surfaces use; kept here so the MCP `doiget_fetch_paper`
716/// tool method does not have to reach across two modules.
717pub fn fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
718    build_fetch_plan(ref_, store_root)
719}
720
721/// Fallible sibling of [`fetch_paper_plan`] — propagates an internal
722/// allowlist-contract drift as a typed [`FetchError::SourceSchema`]
723/// instead of degrading to an empty `candidate_hosts` list (issue
724/// #156 ②). Thin re-export of [`crate::dry_run::try_build_fetch_plan`].
725/// Added alongside the infallible [`fetch_paper_plan`] rather than
726/// changing its signature, because `fetch_paper_plan` is `pub` and
727/// called from `doiget-mcp`, which is out of scope for this batch.
728///
729/// # Errors
730///
731/// See [`crate::dry_run::try_build_fetch_plan`].
732pub fn try_fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
733    try_build_fetch_plan(ref_, store_root)
734}
735
736/// arXiv branch of [`fetch_paper`]. Internal — public callers go
737/// through `fetch_paper`.
738async fn fetch_paper_arxiv(
739    id: &ArxivId,
740    ref_: &Ref,
741    profile: &CapabilityProfile,
742    ctx: &FetchContext,
743    store: &dyn Store,
744    store_root: &Utf8Path,
745    safekey: &Safekey,
746) -> Result<FetchPaperOutcome, FetchError> {
747    let source = arxiv_source_from_env();
748    if !source.can_serve(profile, ref_) {
749        return Err(FetchError::NotEligible {
750            source_key: source.name().to_string(),
751        });
752    }
753
754    let FetchResult {
755        license,
756        pdf_bytes,
757        final_url,
758        ..
759    } = source.fetch(ref_, profile, ctx).await?;
760    let pdf = pdf_bytes.ok_or_else(|| FetchError::SourceSchema {
761        hint: "arxiv source returned no PDF bytes".to_string(),
762    })?;
763    let size_bytes = pdf.len() as u64;
764
765    // Phase 1 minimal metadata. Full Atom-feed extraction (title /
766    // authors) lives in `ArxivSource::fetch_metadata_only` and the
767    // metadata-only orchestrator; the fetch path keeps the placeholder
768    // for now (a follow-up slice may chain in Atom-parse here).
769    let metadata = Metadata {
770        schema_version: SCHEMA_VERSION.to_string(),
771        title: format!("arxiv:{}", id.as_str()),
772        authors: Vec::new(),
773        year: None,
774        doi: None,
775        arxiv_id: Some(id.clone()),
776        abstract_: None,
777        venue: None,
778        publisher: None,
779        issn: None,
780        isbn: None,
781        type_: None,
782        keywords: Vec::new(),
783        url: final_url.as_ref().map(|u| u.to_string()),
784        pdf_path: Some(format!("{}.pdf", safekey.as_str())),
785        doiget: Some(DoigetExtension {
786            fetched_at: Utc::now(),
787            source: "arxiv".to_string(),
788            license: license.clone(),
789            size_bytes,
790            mcp_call_id: None,
791        }),
792        other: BTreeMap::new(),
793    };
794
795    let tmp = stage_pdf_to_tempfile(&pdf)?;
796    let pdf_src = Utf8Path::from_path(tmp.path())
797        .ok_or_else(|| FetchError::SourceSchema {
798            hint: "staging tempfile path is not UTF-8".to_string(),
799        })?
800        .to_path_buf();
801    write_metadata_and_pdf(store, safekey, &metadata, Some(&pdf_src), ctx)?;
802    drop(tmp);
803
804    let path = store_root.join(format!("{}.pdf", safekey.as_str()));
805    let canonical_digest =
806        crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), "arxiv", None).digest_hex();
807    Ok(FetchPaperOutcome {
808        source: "arxiv".to_string(),
809        resolver_profile: "arxiv".to_string(),
810        license,
811        path,
812        size_bytes,
813        schema_version: SCHEMA_VERSION.to_string(),
814        // arXiv always delivers the PDF (or the whole fn already
815        // returned Err above) — there is no metadata-only fallback.
816        pdf_leg: PdfLegStatus::Fetched,
817        safekey: safekey.as_str().to_string(),
818        canonical_digest,
819    })
820}
821
822/// DOI branch of [`fetch_paper`] — Crossref + Unpaywall + (when allowed)
823/// OA-publisher PDF leg. Mirrors the CLI's `fetch_doi` implementation
824/// (`crates/doiget-cli/src/commands/fetch.rs`) — the CLI now delegates
825/// here so both surfaces share one source of truth.
826async fn fetch_paper_doi(
827    doi: &Doi,
828    ref_: &Ref,
829    profile: &CapabilityProfile,
830    ctx: &FetchContext,
831    store: &dyn Store,
832    store_root: &Utf8Path,
833    safekey: &Safekey,
834) -> Result<FetchPaperOutcome, FetchError> {
835    let contact = contact_email_from_env();
836    let unpaywall_contact = unpaywall_email_from_env(&contact);
837    let crossref = crossref_source_from_env(&contact);
838    // Issue #120: Crossref is NON-fatal. A transient Crossref failure
839    // must not abort the whole DOI fetch when Unpaywall alone can
840    // still deliver the OA PDF. We keep the error and only surface it
841    // if nothing usable comes back (see the both-failed guard below).
842    let (cross, crossref_err) = match crossref.fetch(ref_, profile, ctx).await {
843        Ok(r) => (Some(r), None),
844        Err(e) => {
845            tracing::warn!(
846                error = %e,
847                "crossref fetch failed; continuing with unpaywall-only metadata + OA leg"
848            );
849            (None, Some(e))
850        }
851    };
852    let crossref_meta = cross
853        .as_ref()
854        .and_then(|c| c.metadata_json.clone())
855        .unwrap_or(Value::Null);
856    let extracted = extract_crossref_fields(&crossref_meta);
857
858    // Unpaywall second — license enrichment + OA URL chain discovery.
859    // A failure here is non-fatal: we still write the Crossref-
860    // derived metadata.
861    let unpaywall = unpaywall_source_from_env(&unpaywall_contact);
862    let upw_result = unpaywall.fetch(ref_, profile, ctx).await;
863    let (license, source_label, oa_chain) = match upw_result {
864        Ok(r) => {
865            let chain = extract_oa_url_chain(r.metadata_json.as_ref());
866            let label = if r.license != "unknown" {
867                "unpaywall".to_string()
868            } else {
869                "crossref".to_string()
870            };
871            (r.license, label, chain)
872        }
873        Err(e) => {
874            // Unpaywall unreachable / errored. We continue with the
875            // Crossref-only metadata, but the resulting empty OA
876            // chain will be reported downstream as
877            // `PdfLegStatus::NoOaUrl` — semantically distinct from
878            // "Unpaywall confirmed no OA URL". The provenance log
879            // already carries an Unpaywall Fetch err row (the
880            // Unpaywall source impl logged its own attempt before
881            // returning), so the audit trail captures the cause; the
882            // tracing line below makes the orchestrator-level signal
883            // loud as well. Surfacing the distinction at the
884            // `PdfLegStatus` level (a new variant like
885            // `MetadataSourceUnavailable`) is a deliberate
886            // follow-up — see CHANGELOG `[0.4.0]` Notes.
887            tracing::warn!(
888                error = %e,
889                doi = %doi.as_str(),
890                "unpaywall fetch failed; OA chain will be empty (downstream PdfLegStatus::NoOaUrl \
891                 is conservative — Unpaywall was unreachable, not authoritatively oa-free)"
892            );
893            ("unknown".to_string(), "crossref".to_string(), Vec::new())
894        }
895    };
896
897    // OA PDF leg — ADR-0029 fetch chain. Walk the candidate URL list
898    // in order; first successful PDF wins, all-failed surfaces as
899    // `PdfLegStatus::Blocked` with the LAST attempt's error (the most
900    // informative for the operator — typically the network /
901    // allowlist reason the chain could not be exhausted). Each
902    // `try_fetch_oa_pdf` call already emits its own per-attempt
903    // provenance row (`oa-publisher` Fetch ok / err), so the audit
904    // trail captures every external request without orchestrator-
905    // side bookkeeping.
906    //
907    // Issue #118: a failure here is NEVER silently turned into a
908    // clean metadata-only success — the structured reason is carried
909    // out on `PdfLegStatus::Blocked`.
910    let (pdf_leg, pdf_bytes) = if oa_chain.is_empty() {
911        (PdfLegStatus::NoOaUrl, None)
912    } else {
913        let mut succeeded: Option<Vec<u8>> = None;
914        let mut last_err: Option<HttpError> = None;
915        let total = oa_chain.len();
916        for (idx, candidate) in oa_chain.iter().enumerate() {
917            let attempt = idx + 1;
918            tracing::debug!(
919                attempt,
920                total,
921                url = %candidate,
922                "trying OA PDF candidate (ADR-0029 chain)"
923            );
924            match try_fetch_oa_pdf(doi, candidate, ctx).await {
925                Ok((bytes, _final_url)) => {
926                    if attempt > 1 {
927                        tracing::info!(
928                            attempt,
929                            total,
930                            url = %candidate,
931                            "OA PDF chain succeeded on fallback candidate (ADR-0029)"
932                        );
933                    }
934                    succeeded = Some(bytes);
935                    break;
936                }
937                Err(e) => {
938                    tracing::warn!(
939                        attempt,
940                        total,
941                        url = %candidate,
942                        error = %e,
943                        "OA PDF candidate failed; advancing to next (ADR-0029 chain)"
944                    );
945                    last_err = Some(e);
946                }
947            }
948        }
949        match (succeeded, last_err) {
950            (Some(bytes), _) => (PdfLegStatus::Fetched, Some(bytes)),
951            (None, Some(e)) => {
952                let fe = FetchError::Http(e);
953                let denial: Option<crate::DenialContext> = (&fe).into();
954                let message = fe.to_string();
955                let code: crate::ErrorCode = fe.into();
956                let suggested_arxiv_id = oa_chain.iter().find_map(extract_arxiv_id_from_url);
957                (
958                    PdfLegStatus::Blocked {
959                        code,
960                        message,
961                        denial,
962                        suggested_arxiv_id,
963                    },
964                    None,
965                )
966            }
967            // Defensive fallback. `oa_chain` is non-empty in this
968            // branch, so structurally at least one iteration must set
969            // either `succeeded` or `last_err`. If a future refactor
970            // breaks the invariant we fail CLOSED — surface a
971            // `Blocked` outcome with a self-describing message
972            // rather than `NoOaUrl` (which would falsely tell the
973            // caller no candidate URL was ever discovered). Routes
974            // to `INTERNAL_ERROR` so the CLI's exit-code mapping
975            // signals a doiget bug, not a remote failure.
976            (None, None) => {
977                tracing::error!(
978                    total = oa_chain.len(),
979                    "OA PDF chain walker exhausted without recording success or error \
980                     (defensive fallback — should be unreachable)"
981                );
982                (
983                    PdfLegStatus::Blocked {
984                        code: crate::ErrorCode::InternalError,
985                        message:
986                            "OA PDF chain walker exhausted without recording success or error \
987                             (orchestrator bug — please report)"
988                                .to_string(),
989                        denial: None,
990                        suggested_arxiv_id: None,
991                    },
992                    None,
993                )
994            }
995        }
996    };
997
998    // Issue #120: Crossref is non-fatal, but if it failed AND the OA
999    // PDF leg produced nothing, writing a DOI-only stub entry would
1000    // mask a total failure and violate the "explain why" promise.
1001    // Surface the Crossref error so the caller reports a real reason.
1002    if let Some(e) = crossref_err {
1003        if pdf_bytes.is_none() {
1004            return Err(e);
1005        }
1006    }
1007
1008    let (final_source_label, size_bytes, pdf_path_relative, pdf_staged) = match &pdf_bytes {
1009        Some(bytes) => {
1010            let staged = stage_pdf_to_tempfile(bytes)?;
1011            (
1012                "oa-publisher".to_string(),
1013                bytes.len() as u64,
1014                Some(format!("{}.pdf", safekey.as_str())),
1015                Some(staged),
1016            )
1017        }
1018        None => (source_label, 0u64, None, None),
1019    };
1020
1021    let metadata = Metadata {
1022        schema_version: SCHEMA_VERSION.to_string(),
1023        title: extracted.title.unwrap_or_else(|| doi.as_str().to_string()),
1024        authors: extracted.authors,
1025        year: extracted.year,
1026        doi: Some(doi.clone()),
1027        arxiv_id: None,
1028        abstract_: None,
1029        venue: extracted.venue,
1030        publisher: None,
1031        issn: None,
1032        isbn: None,
1033        type_: extracted.type_,
1034        keywords: Vec::new(),
1035        url: cross
1036            .as_ref()
1037            .and_then(|c| c.final_url.as_ref())
1038            .map(|u| u.to_string()),
1039        pdf_path: pdf_path_relative,
1040        doiget: Some(DoigetExtension {
1041            fetched_at: Utc::now(),
1042            source: final_source_label.clone(),
1043            license: license.clone(),
1044            size_bytes,
1045            mcp_call_id: None,
1046        }),
1047        other: BTreeMap::new(),
1048    };
1049
1050    let pdf_src_path = pdf_staged
1051        .as_ref()
1052        .and_then(|tmp| Utf8Path::from_path(tmp.path()).map(|p| p.to_path_buf()));
1053    write_metadata_and_pdf(store, safekey, &metadata, pdf_src_path.as_deref(), ctx)?;
1054    drop(pdf_staged);
1055
1056    let path = if pdf_bytes.is_some() {
1057        store_root.join(format!("{}.pdf", safekey.as_str()))
1058    } else {
1059        store_root
1060            .join(".metadata")
1061            .join(format!("{}.toml", safekey.as_str()))
1062    };
1063    let canonical_digest = crate::CanonicalRef::new(
1064        crate::SourceType::Doi,
1065        doi.as_str(),
1066        &final_source_label,
1067        None,
1068    )
1069    .digest_hex();
1070    Ok(FetchPaperOutcome {
1071        source: final_source_label.clone(),
1072        resolver_profile: final_source_label,
1073        license,
1074        path,
1075        size_bytes,
1076        schema_version: SCHEMA_VERSION.to_string(),
1077        pdf_leg,
1078        safekey: safekey.as_str().to_string(),
1079        canonical_digest,
1080    })
1081}
1082
1083/// Stage PDF bytes to a tempfile so the existing `Store::write` atomic-
1084/// rename code path applies (the store takes a path, not bytes).
1085fn stage_pdf_to_tempfile(bytes: &[u8]) -> Result<tempfile::NamedTempFile, FetchError> {
1086    let tmp = tempfile::NamedTempFile::new().map_err(|e| FetchError::SourceSchema {
1087        hint: format!("creating PDF staging tempfile: {e}"),
1088    })?;
1089    std::fs::write(tmp.path(), bytes).map_err(|e| FetchError::SourceSchema {
1090        hint: format!("staging PDF bytes: {e}"),
1091    })?;
1092    Ok(tmp)
1093}
1094
1095/// Persist `metadata` (and optionally a PDF at `pdf_src`) through the
1096/// trait-object [`Store`] and emit a `StoreWrite` provenance row.
1097fn write_metadata_and_pdf(
1098    store: &dyn Store,
1099    safekey: &Safekey,
1100    metadata: &Metadata,
1101    pdf_src: Option<&Utf8Path>,
1102    ctx: &FetchContext,
1103) -> Result<(), FetchError> {
1104    let store_path_relative = if pdf_src.is_some() {
1105        format!("{}.pdf", safekey.as_str())
1106    } else {
1107        format!(".metadata/{}.toml", safekey.as_str())
1108    };
1109    let size_bytes = metadata.doiget.as_ref().map(|d| d.size_bytes).unwrap_or(0);
1110    let license = metadata.doiget.as_ref().map(|d| d.license.as_str());
1111    let source_name = metadata.doiget.as_ref().map(|d| d.source.as_str());
1112
1113    // ADR-0021 §1 canonical-digest for the StoreWrite row. The store
1114    // entry is keyed on the ref + the resolver that produced its
1115    // metadata (already captured in `metadata.doiget.source`). Build a
1116    // CanonicalRef from whichever id slot is populated.
1117    let canonical_digest: Option<String> = match (metadata.doi.as_ref(), metadata.arxiv_id.as_ref())
1118    {
1119        (Some(d), _) => source_name.map(|s| {
1120            crate::CanonicalRef::new(crate::SourceType::Doi, d.as_str(), s, None).digest_hex()
1121        }),
1122        (None, Some(a)) => source_name.map(|s| {
1123            crate::CanonicalRef::new(crate::SourceType::Arxiv, a.as_str(), s, None).digest_hex()
1124        }),
1125        (None, None) => None,
1126    };
1127
1128    match store.write(safekey, metadata, pdf_src) {
1129        Ok(()) => {
1130            ctx.log.append(RowInput {
1131                event: LogEvent::StoreWrite,
1132                result: LogResult::Ok,
1133                capability: Capability::Oa,
1134                ref_: metadata
1135                    .doi
1136                    .as_ref()
1137                    .map(|d| d.as_str())
1138                    .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1139                source: source_name,
1140                error_code: None,
1141                size_bytes: Some(size_bytes),
1142                license,
1143                store_path: Some(&store_path_relative),
1144                canonical_digest: canonical_digest.as_deref(),
1145            })?;
1146            Ok(())
1147        }
1148        Err(e) => {
1149            // Best-effort: record the StoreWrite failure before
1150            // propagating the store.write error. We do NOT
1151            // propagate the log-append error itself here — we're
1152            // already in an error state from the store, and the
1153            // primary failure is what the caller needs to act on.
1154            // But the log-append failure is observable via tracing
1155            // so an operator can spot a broken hash chain when
1156            // both fail. Surface as `SourceSchema` so the
1157            // FetchError -> ErrorCode collapse routes it to
1158            // `INTERNAL_ERROR` (closest closed-set fit; `StoreError`
1159            // does not have a direct closed-set arm).
1160            if let Err(log_err) = ctx.log.append(RowInput {
1161                event: LogEvent::StoreWrite,
1162                result: LogResult::Err,
1163                capability: Capability::Oa,
1164                ref_: metadata
1165                    .doi
1166                    .as_ref()
1167                    .map(|d| d.as_str())
1168                    .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1169                source: source_name,
1170                error_code: Some("STORE_ERROR"),
1171                size_bytes: None,
1172                license: None,
1173                store_path: Some(&store_path_relative),
1174                canonical_digest: canonical_digest.as_deref(),
1175            }) {
1176                tracing::error!(
1177                    store_err = %e,
1178                    log_err = %log_err,
1179                    "BOTH store.write AND provenance log append failed; \
1180                     audit trail is broken for this attempt"
1181                );
1182            }
1183            Err(FetchError::SourceSchema {
1184                hint: format!("store write failed: {e}"),
1185            })
1186        }
1187    }
1188}
1189
1190/// Attempt the OA PDF fetch under the `"oa-publisher"` source key.
1191async fn try_fetch_oa_pdf(
1192    doi: &Doi,
1193    url: &url::Url,
1194    ctx: &FetchContext,
1195) -> Result<(Vec<u8>, url::Url), HttpError> {
1196    const SOURCE: &str = "oa-publisher";
1197    let _permit = ctx.rate_limiter.acquire(SOURCE).await;
1198    // ADR-0021 §1: the oa-publisher PDF leg is a DISTINCT audit
1199    // identity from the Crossref/Unpaywall metadata legs even though
1200    // the ref is the same DOI — that's the whole point of carrying
1201    // `resolver_profile` into the digest. Compute once and re-use for
1202    // both the ok and err row variants below.
1203    let canonical =
1204        crate::CanonicalRef::new(crate::SourceType::Doi, doi.as_str(), SOURCE, None).digest_hex();
1205
1206    // Pre-fetch host allowlist check on the metadata-discovered OA URL
1207    // (issue #145; `docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE). The
1208    // per-source `redirect_hosts` allowlist is, by §1, consulted "on the
1209    // OA URL discovered through metadata sources before the actual PDF
1210    // fetch is issued", not only on redirect hops. The redirect closure in
1211    // `crate::http` only fires when an *actual redirect* occurs; an OA URL
1212    // whose host is off the `oa-publisher` allowlist that resolves WITHOUT
1213    // a redirect would otherwise reach connect and be misclassified as a
1214    // transport error, violating §1. This is scoped strictly to the
1215    // `"oa-publisher"` PDF leg — §6 explicitly exempts the initial
1216    // template-constructed URL, and `fetch_bytes`/metadata-only/resolve-
1217    // only paths (which never follow the OA URL) are deliberately NOT
1218    // touched. On a host MISS we return the *same* `HttpError::RedirectDenied`
1219    // value the redirect closure produces (same `source_key`, lowercased
1220    // `host`, and `expected_hosts` snapshot), reusing the identical
1221    // allowlist the closure captured (queried via `source_allowlist`, not
1222    // re-derived) so the single source of truth cannot drift. Returning
1223    // that exact variant means the existing `Err(e)` arm below, the
1224    // `From<&HttpError> for Option<DenialContext>` mapping
1225    // (`DenialReason::RedirectNotInAllowlist`), the `PdfLegStatus::Blocked`
1226    // construction in the caller, and PR #162's CLI classification all see
1227    // a byte-identical downstream shape with no new code path.
1228    if let Some(allowlist) = ctx.http.source_allowlist(SOURCE) {
1229        // `Url::host_str()` is `None` for hostless URLs (e.g. `data:`);
1230        // treat that exactly as the redirect closure does (an allowlist
1231        // miss with an empty host string).
1232        let host = url
1233            .host_str()
1234            .map(|h| h.to_ascii_lowercase())
1235            .unwrap_or_default();
1236        if !allowlist.matches(&host) {
1237            let e = HttpError::RedirectDenied {
1238                source_key: SOURCE.to_string(),
1239                host: host.clone(),
1240                expected_hosts: allowlist.redirect_hosts.clone(),
1241            };
1242            tracing::info!(
1243                oa_url = %url,
1244                denied_host = %host,
1245                "OA URL host outside oa-publisher allowlist (pre-fetch check, \
1246                 docs/REDIRECT_ALLOWLIST.md §1 / issue #145)"
1247            );
1248            // Emit the SAME provenance row the post-fetch redirect-denied
1249            // path emits: a `Fetch` `Err` row under the `oa-publisher`
1250            // source key with the closed-set `NETWORK_ERROR` code and the
1251            // same canonical digest. Mirrors the `Err(e)` arm below so the
1252            // audit trail is indistinguishable from a redirect-time denial.
1253            let _ = ctx.log.append(RowInput {
1254                event: LogEvent::Fetch,
1255                result: LogResult::Err,
1256                capability: Capability::Oa,
1257                ref_: Some(doi.as_str()),
1258                source: Some(SOURCE),
1259                error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1260                size_bytes: None,
1261                license: None,
1262                store_path: None,
1263                canonical_digest: Some(&canonical),
1264            });
1265            return Err(e);
1266        }
1267    }
1268
1269    match ctx.http.fetch_pdf(SOURCE, url.clone()).await {
1270        Ok((body, final_url)) => {
1271            let size_bytes = body.len() as u64;
1272            if let Err(e) = ctx.log.append(RowInput {
1273                event: LogEvent::Fetch,
1274                result: LogResult::Ok,
1275                capability: Capability::Oa,
1276                ref_: Some(doi.as_str()),
1277                source: Some(SOURCE),
1278                error_code: None,
1279                size_bytes: Some(size_bytes),
1280                license: None,
1281                store_path: None,
1282                canonical_digest: Some(&canonical),
1283            }) {
1284                tracing::warn!(error = %e, "appending oa-publisher Fetch ok row failed");
1285            }
1286            Ok((body.to_vec(), final_url))
1287        }
1288        Err(e) => {
1289            match &e {
1290                HttpError::RedirectDenied { host, .. } => {
1291                    tracing::info!(
1292                        oa_url = %url,
1293                        denied_host = %host,
1294                        "OA URL host outside oa-publisher allowlist"
1295                    );
1296                }
1297                HttpError::NotAPdf { .. } => {
1298                    tracing::info!(
1299                        oa_url = %url,
1300                        "OA URL did not return a PDF magic byte"
1301                    );
1302                }
1303                other => {
1304                    tracing::warn!(
1305                        oa_url = %url,
1306                        error = %other,
1307                        "OA PDF fetch failed"
1308                    );
1309                }
1310            }
1311            // Provenance `error_code` is the CLOSED-set code. Every
1312            // `HttpError` collapses to `NETWORK_ERROR` through the
1313            // canonical `From<FetchError> for ErrorCode` (the closed
1314            // set has no finer transport code by design) — so this is
1315            // the correct mapped value, not the misattribution the
1316            // previous hardcode implied. The *fine* reason
1317            // (RedirectDenied vs NotAPdf vs …) is preserved for the
1318            // user via `PdfLegStatus::Blocked.denial` / `.message`
1319            // built by the caller from the returned `HttpError`
1320            // (issue #118). Rendered via `ErrorCode::as_wire` so the
1321            // token can never drift from the enum.
1322            let _ = ctx.log.append(RowInput {
1323                event: LogEvent::Fetch,
1324                result: LogResult::Err,
1325                capability: Capability::Oa,
1326                ref_: Some(doi.as_str()),
1327                source: Some(SOURCE),
1328                error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1329                size_bytes: None,
1330                license: None,
1331                store_path: None,
1332                canonical_digest: Some(&canonical),
1333            });
1334            Err(e)
1335        }
1336    }
1337}
1338
/// The bibliographic fields lifted from a Crossref `message` object into
/// the on-disk metadata. Every field is optional (or possibly empty)
/// because [`extract_crossref_fields`] never fails on a malformed payload.
pub(crate) struct CrossrefFields {
    /// First string in the `title` array, when present.
    pub(crate) title: Option<String>,
    /// One entry per `author` element that carries a `family` and/or
    /// `given` string; formatted `"family, given"` when both exist.
    pub(crate) authors: Vec<String>,
    /// Leading integer of the first `issued.date-parts` entry, when it
    /// fits in an `i32`.
    pub(crate) year: Option<i32>,
    /// First string in the `container-title` array, when present.
    pub(crate) venue: Option<String>,
    /// The `type` string, when present.
    pub(crate) type_: Option<String>,
}
1347
1348/// Defensively pull bibliographic fields out of a Crossref envelope's
1349/// message object. Every field is optional; malformed shapes degrade
1350/// to None rather than panicking.
1351pub(crate) fn extract_crossref_fields(msg: &Value) -> CrossrefFields {
1352    let title = msg
1353        .get("title")
1354        .and_then(|v| v.as_array())
1355        .and_then(|arr| arr.first())
1356        .and_then(|v| v.as_str())
1357        .map(|s| s.to_string());
1358
1359    let authors = msg
1360        .get("author")
1361        .and_then(|v| v.as_array())
1362        .map(|arr| {
1363            arr.iter()
1364                .filter_map(|a| {
1365                    let family = a.get("family").and_then(|v| v.as_str());
1366                    let given = a.get("given").and_then(|v| v.as_str());
1367                    match (family, given) {
1368                        (Some(f), Some(g)) => Some(format!("{f}, {g}")),
1369                        (Some(f), None) => Some(f.to_string()),
1370                        (None, Some(g)) => Some(g.to_string()),
1371                        _ => None,
1372                    }
1373                })
1374                .collect()
1375        })
1376        .unwrap_or_default();
1377
1378    let year = msg
1379        .get("issued")
1380        .and_then(|v| v.get("date-parts"))
1381        .and_then(|v| v.as_array())
1382        .and_then(|arr| arr.first())
1383        .and_then(|v| v.as_array())
1384        .and_then(|arr| arr.first())
1385        .and_then(|v| v.as_i64())
1386        .and_then(|n| i32::try_from(n).ok());
1387
1388    let venue = msg
1389        .get("container-title")
1390        .and_then(|v| v.as_array())
1391        .and_then(|arr| arr.first())
1392        .and_then(|v| v.as_str())
1393        .map(|s| s.to_string());
1394
1395    let type_ = msg
1396        .get("type")
1397        .and_then(|v| v.as_str())
1398        .map(|s| s.to_string());
1399
1400    CrossrefFields {
1401        title,
1402        authors,
1403        year,
1404        venue,
1405        type_,
1406    }
1407}
1408
1409/// Pull the ordered chain of candidate OA URLs out of an Unpaywall
1410/// `metadata_json` envelope per ADR-0029 D2.
1411///
1412/// Order is `best_oa_location` first (when present), then every
1413/// distinct entry in `oa_locations[]`. Duplicate URLs are deduped by
1414/// exact string match so a candidate that appears as both the "best"
1415/// entry and an array element is fetched at most once.
1416///
1417/// Each location's URL is resolved via the same `url_for_pdf` →
1418/// `url` fallback the single-URL extractor uses.
1419///
1420/// Returns `Vec::new()` when no OA location was reported (the chain
1421/// is empty and the caller surfaces [`PdfLegStatus::NoOaUrl`]).
1422fn extract_oa_url_chain(meta: Option<&Value>) -> Vec<url::Url> {
1423    let meta = match meta {
1424        Some(m) => m,
1425        None => return Vec::new(),
1426    };
1427    let mut out: Vec<url::Url> = Vec::new();
1428    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
1429    let mut push_unique = |u: url::Url| {
1430        let key = u.as_str().to_string();
1431        if seen.insert(key) {
1432            out.push(u);
1433        }
1434    };
1435
1436    // Priority 1: best_oa_location (Unpaywall's own quality-ordered
1437    // pick — ADR-0029 D2 NORMATIVE: defer to the metadata source's
1438    // ordering).
1439    if let Some(best) = meta.get("best_oa_location") {
1440        if let Some(u) = pull_oa_url_from_location(best) {
1441            push_unique(u);
1442        }
1443    }
1444    // Priority 2: every entry in oa_locations[] after the best one.
1445    // The fallback target this ADR exists to enable is precisely the
1446    // arXiv preprint that lives here when `best_oa_location` is a
1447    // WAF-blocked publisher URL.
1448    if let Some(arr) = meta.get("oa_locations").and_then(|v| v.as_array()) {
1449        for loc in arr {
1450            if let Some(u) = pull_oa_url_from_location(loc) {
1451                push_unique(u);
1452            }
1453        }
1454    }
1455    out
1456}
1457
1458/// Resolve a single OA location object to a `url::Url`. Tries
1459/// `url_for_pdf` first (the direct PDF link Unpaywall annotates when
1460/// it knows one), falling back to `url` (the landing page). Returns
1461/// `None` if neither field is present or parses.
1462fn pull_oa_url_from_location(loc: &Value) -> Option<url::Url> {
1463    let candidate = loc
1464        .get("url_for_pdf")
1465        .and_then(|v| v.as_str())
1466        .or_else(|| loc.get("url").and_then(|v| v.as_str()))?;
1467    url::Url::parse(candidate).ok()
1468}
1469
1470/// Helper to parse clean arXiv IDs from URLs like arxiv.org/pdf/1901.12345.pdf.
1471///
1472/// Strips the trailing `.pdf` extension and any version suffix (`v1`, `v2`, …)
1473/// so the returned ID refers to the latest version rather than pinning a
1474/// specific one. Returns `None` for non-arXiv hosts or unrecognised path shapes.
1475fn extract_arxiv_id_from_url(url: &url::Url) -> Option<String> {
1476    let host = url.host_str()?;
1477    let is_arxiv = matches!(
1478        host,
1479        "arxiv.org" | "www.arxiv.org" | "export.arxiv.org" | "e-print.arxiv.org"
1480    );
1481    if !is_arxiv {
1482        return None;
1483    }
1484    let path = url.path();
1485    let raw = if path.starts_with("/pdf/") {
1486        let s = path.strip_prefix("/pdf/")?;
1487        s.strip_suffix(".pdf").unwrap_or(s)
1488    } else if path.starts_with("/abs/") {
1489        path.strip_prefix("/abs/")?
1490    } else {
1491        return None;
1492    };
1493    Some(strip_arxiv_version(raw).to_string())
1494}
1495
/// Remove a trailing arXiv version suffix (`v1`, `v2`, …) from an ID string.
///
/// The text after the **last** `v` is treated as a version only when that
/// `v` directly follows an ASCII digit (so category fragments such as
/// `quant-ph` are never cut) and is followed by one or more ASCII digits.
/// Any other input is returned unchanged.
fn strip_arxiv_version(id: &str) -> &str {
    match id.rsplit_once('v') {
        Some((stem, version))
            if stem.ends_with(|c: char| c.is_ascii_digit())
                && !version.is_empty()
                && version.bytes().all(|b| b.is_ascii_digit()) =>
        {
            stem
        }
        _ => id,
    }
}
1514
/// Resolve the contact email for the Unpaywall source.
///
/// Reads the `DOIGET_UNPAYWALL_EMAIL` environment variable and returns its
/// value unchanged when it is set to something non-blank. Falls back to
/// `fallback_contact` when the variable is unset, is not valid Unicode, or
/// is empty / whitespace-only — a blank override would otherwise replace a
/// usable contact address with an empty one.
fn unpaywall_email_from_env(fallback_contact: &str) -> String {
    match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
        Ok(email) if !email.trim().is_empty() => email,
        _ => fallback_contact.to_string(),
    }
}
1518
1519// ---------------------------------------------------------------------------
1520// batch_fetch — multi-ref orchestrator (Slice 2)
1521// ---------------------------------------------------------------------------
1522
1523/// Per-ref outcome carried inside [`BatchOutcome::results`].
1524///
1525/// Each entry's `outcome` is independent — a single `Err(...)` does not
1526/// abort sibling refs. The MCP `doiget_batch_fetch` tool method
1527/// serializes the success-or-error per row inside `results[]`.
1528#[derive(Debug)]
1529pub struct BatchResultEntry {
1530    /// The parsed ref this entry describes.
1531    pub ref_: Ref,
1532    /// `Ok(...)` on a successful fetch through [`fetch_paper`];
1533    /// `Err(...)` on a per-ref failure (the outer call still returned
1534    /// `Ok(BatchOutcome)`).
1535    pub outcome: Result<FetchPaperOutcome, FetchError>,
1536}
1537
1538/// Outcome of a successful [`batch_fetch`] call.
1539///
1540/// The outer call returns `Err(_)` only on whole-call failures (the
1541/// only such variant in Slice 2 is [`FetchError::TooManyRefs`]). Each
1542/// per-ref result lives inside `results[]` so the agent can see every
1543/// outcome without losing sibling successes.
1544#[derive(Debug)]
1545#[non_exhaustive]
1546pub struct BatchOutcome {
1547    /// One entry per supplied ref, in input order.
1548    pub results: Vec<BatchResultEntry>,
1549}
1550
1551/// Iterate over `refs` through [`fetch_paper`], collecting one
1552/// [`BatchResultEntry`] per ref.
1553///
1554/// **Cap**: caller must supply at most [`MAX_BATCH_REFS`] refs; otherwise
1555/// the function returns `Err(FetchError::TooManyRefs { got, max })`
1556/// before any fetch is attempted. The cap mirrors the CLI's
1557/// `commands::batch` enforcement (`MCP_BATCH_MAX_SIZE`).
1558///
1559/// **Concurrency**: Slice 2 dispatches refs serially through
1560/// [`fetch_paper`]. The CLI's existing `commands::batch::run_with_options`
1561/// keeps its bounded-concurrency `JoinSet`+semaphore path for backward
1562/// compatibility; the MCP server uses this serial loop because the MCP
1563/// tool boundary already serializes calls per session.
1564///
1565/// **Session bookkeeping**: this function does NOT emit `SessionStart`
1566/// / `SessionEnd` rows — that is the caller's responsibility.
1567pub async fn batch_fetch(
1568    refs: &[Ref],
1569    profile: &CapabilityProfile,
1570    ctx: &FetchContext,
1571    store: &dyn Store,
1572    store_root: &Utf8Path,
1573) -> Result<BatchOutcome, FetchError> {
1574    if refs.len() > MAX_BATCH_REFS {
1575        return Err(FetchError::TooManyRefs {
1576            got: refs.len(),
1577            max: MAX_BATCH_REFS,
1578        });
1579    }
1580    let mut results = Vec::with_capacity(refs.len());
1581    for ref_ in refs {
1582        let outcome = fetch_paper(ref_, profile, ctx, store, store_root).await;
1583        results.push(BatchResultEntry {
1584            ref_: ref_.clone(),
1585            outcome,
1586        });
1587    }
1588    Ok(BatchOutcome { results })
1589}
1590
1591/// Dry-run preview for a batch — one [`FetchPlan`] per ref. Enforces
1592/// the same [`MAX_BATCH_REFS`] cap [`batch_fetch`] does.
1593///
1594/// Returns `Err(FetchError::TooManyRefs)` when over the cap, or
1595/// `Err(FetchError::SourceSchema)` if the dry-run allowlist invariant
1596/// has drifted (issue #156 ②: this now propagates as a typed error via
1597/// [`try_build_fetch_plan`] rather than silently emitting an empty
1598/// `candidate_hosts` list — the signature already returned `Result`, so
1599/// this is an in-crate behavior tightening with no caller-visible type
1600/// change). Otherwise `Ok(Vec<(Ref, FetchPlan)>)` parallel to the input
1601/// order.
1602pub fn batch_fetch_plans(
1603    refs: &[Ref],
1604    store_root: &Utf8Path,
1605) -> Result<Vec<(Ref, FetchPlan)>, FetchError> {
1606    if refs.len() > MAX_BATCH_REFS {
1607        return Err(FetchError::TooManyRefs {
1608            got: refs.len(),
1609            max: MAX_BATCH_REFS,
1610        });
1611    }
1612    refs.iter()
1613        .map(|r| try_build_fetch_plan(r, store_root).map(|p| (r.clone(), p)))
1614        .collect()
1615}
1616
1617// ---------------------------------------------------------------------------
1618// Tests
1619// ---------------------------------------------------------------------------
1620
1621#[cfg(test)]
1622#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1623mod tests {
1624    use super::*;
1625
1626    #[test]
1627    fn test_extract_arxiv_id_from_url() {
1628        let urls = [
1629            // Basic new-style ID
1630            ("https://arxiv.org/pdf/1901.12345.pdf", Some("1901.12345")),
1631            ("https://arxiv.org/abs/1901.12345", Some("1901.12345")),
1632            // Version suffix is stripped
1633            ("https://arxiv.org/pdf/1901.12345v2.pdf", Some("1901.12345")),
1634            ("https://arxiv.org/abs/1901.12345v3", Some("1901.12345")),
1635            // Old-style category/ID
1636            (
1637                "https://www.arxiv.org/pdf/cond-mat/9501001.pdf",
1638                Some("cond-mat/9501001"),
1639            ),
1640            (
1641                "https://export.arxiv.org/abs/cond-mat/9501001",
1642                Some("cond-mat/9501001"),
1643            ),
1644            // Old-style with version stripped
1645            (
1646                "https://arxiv.org/pdf/cond-mat/9501001v1.pdf",
1647                Some("cond-mat/9501001"),
1648            ),
1649            // e-print subdomain
1650            (
1651                "https://e-print.arxiv.org/pdf/2401.12345.pdf",
1652                Some("2401.12345"),
1653            ),
1654            // Non-arXiv host
1655            ("https://example.org/pdf/1901.12345.pdf", None),
1656        ];
1657        for (url_str, expected) in urls {
1658            let url = url::Url::parse(url_str).unwrap();
1659            assert_eq!(
1660                extract_arxiv_id_from_url(&url),
1661                expected.map(String::from),
1662                "url: {url_str}"
1663            );
1664        }
1665    }
1666
1667    #[test]
1668    fn test_strip_arxiv_version() {
1669        assert_eq!(strip_arxiv_version("2401.12345v2"), "2401.12345");
1670        assert_eq!(strip_arxiv_version("2401.12345v10"), "2401.12345");
1671        assert_eq!(strip_arxiv_version("2401.12345"), "2401.12345");
1672        assert_eq!(
1673            strip_arxiv_version("cond-mat/9501001v3"),
1674            "cond-mat/9501001"
1675        );
1676        // "v" not followed by digits — unchanged
1677        assert_eq!(strip_arxiv_version("quant-phv5"), "quant-phv5");
1678    }
1679
1680    #[test]
1681    fn extract_crossref_oa_url_finds_first_url() {
1682        let msg = serde_json::json!({
1683            "link": [
1684                {"URL": "https://example.org/free.pdf"},
1685                {"URL": "https://example.org/alt.pdf"}
1686            ]
1687        });
1688        assert_eq!(
1689            extract_crossref_oa_url(&msg),
1690            Some("https://example.org/free.pdf".to_string())
1691        );
1692    }
1693
1694    #[test]
1695    fn extract_crossref_oa_url_returns_none_when_absent() {
1696        let msg = serde_json::json!({});
1697        assert!(extract_crossref_oa_url(&msg).is_none());
1698    }
1699
1700    #[test]
1701    fn extract_crossref_oa_url_skips_empty_url_strings() {
1702        let msg = serde_json::json!({
1703            "link": [
1704                {"URL": ""},
1705                {"URL": "https://example.org/real.pdf"}
1706            ]
1707        });
1708        assert_eq!(
1709            extract_crossref_oa_url(&msg),
1710            Some("https://example.org/real.pdf".to_string())
1711        );
1712    }
1713
1714    #[test]
1715    fn extract_unpaywall_oa_url_prefers_url_for_pdf() {
1716        let meta = serde_json::json!({
1717            "best_oa_location": {
1718                "url_for_pdf": "https://example.org/pdf",
1719                "url": "https://example.org/landing"
1720            }
1721        });
1722        assert_eq!(
1723            extract_unpaywall_oa_url(&meta),
1724            Some("https://example.org/pdf".to_string())
1725        );
1726    }
1727
1728    #[test]
1729    fn extract_unpaywall_oa_url_falls_back_to_url() {
1730        let meta = serde_json::json!({
1731            "best_oa_location": {
1732                "url": "https://example.org/landing"
1733            }
1734        });
1735        assert_eq!(
1736            extract_unpaywall_oa_url(&meta),
1737            Some("https://example.org/landing".to_string())
1738        );
1739    }
1740
1741    #[test]
1742    fn extract_unpaywall_oa_url_returns_none_when_absent() {
1743        let meta = serde_json::json!({});
1744        assert!(extract_unpaywall_oa_url(&meta).is_none());
1745    }
1746
1747    // ---------------------------------------------------------------
1748    // Slice 2: fetch_paper / batch_fetch coverage. The wiremock-driven
1749    // happy-path tests live in `crates/doiget-mcp/tests/...` (they need
1750    // a real `Store` impl and an HTTP client wired to `FetchContext`,
1751    // both of which the MCP integration tests already stand up). The
1752    // unit tests here pin the pure-function pieces (extractors, cap
1753    // enforcement, plan-shape preservation).
1754    // ---------------------------------------------------------------
1755
1756    #[test]
1757    fn extract_crossref_fields_parses_minimal_shape() {
1758        let msg = serde_json::json!({
1759            "title": ["Example Title"],
1760            "author": [{ "family": "Smith", "given": "Alice" }],
1761            "issued": { "date-parts": [[2024, 1, 15]] },
1762            "container-title": ["Phys. Rev. X"],
1763            "type": "journal-article"
1764        });
1765        let f = extract_crossref_fields(&msg);
1766        assert_eq!(f.title.as_deref(), Some("Example Title"));
1767        assert_eq!(f.authors, vec!["Smith, Alice".to_string()]);
1768        assert_eq!(f.year, Some(2024));
1769        assert_eq!(f.venue.as_deref(), Some("Phys. Rev. X"));
1770        assert_eq!(f.type_.as_deref(), Some("journal-article"));
1771    }
1772
1773    #[test]
1774    fn extract_crossref_fields_tolerates_missing() {
1775        let f = extract_crossref_fields(&serde_json::json!({}));
1776        assert!(f.title.is_none());
1777        assert!(f.authors.is_empty());
1778        assert!(f.year.is_none());
1779        assert!(f.venue.is_none());
1780        assert!(f.type_.is_none());
1781    }
1782
1783    #[test]
1784    fn extract_oa_url_chain_prefers_best_url_for_pdf() {
1785        // `best_oa_location.url_for_pdf` is the highest-priority
1786        // candidate (ADR-0029 D2 — defer to the metadata source's
1787        // ordering). Falls back to `best_oa_location.url` only when
1788        // no PDF link is annotated.
1789        let meta = serde_json::json!({
1790            "best_oa_location": {
1791                "url_for_pdf": "https://example.org/pdf",
1792                "url": "https://example.org/landing"
1793            }
1794        });
1795        let chain = extract_oa_url_chain(Some(&meta));
1796        assert_eq!(chain.len(), 1);
1797        assert_eq!(chain[0].as_str(), "https://example.org/pdf");
1798    }
1799
1800    #[test]
1801    fn extract_oa_url_chain_falls_back_to_url_when_url_for_pdf_absent() {
1802        let meta = serde_json::json!({
1803            "best_oa_location": {
1804                "url": "https://example.org/landing"
1805            }
1806        });
1807        let chain = extract_oa_url_chain(Some(&meta));
1808        assert_eq!(chain.len(), 1);
1809        assert_eq!(chain[0].as_str(), "https://example.org/landing");
1810    }
1811
1812    #[test]
1813    fn extract_oa_url_chain_is_empty_when_no_locations() {
1814        let meta = serde_json::json!({});
1815        assert!(extract_oa_url_chain(Some(&meta)).is_empty());
1816        assert!(extract_oa_url_chain(None).is_empty());
1817    }
1818
1819    #[test]
1820    fn extract_oa_url_chain_appends_oa_locations_after_best() {
1821        // ADR-0029 D2: best_oa_location first, then the rest of
1822        // oa_locations in metadata-source order. This is the load-
1823        // bearing test: it pins the fact that an arXiv preprint
1824        // listed *after* a WAF-blocked publisher in oa_locations[]
1825        // becomes a fallback candidate the chain walker can reach.
1826        let meta = serde_json::json!({
1827            "best_oa_location": {
1828                "url_for_pdf": "https://publisher.example.org/pdf"
1829            },
1830            "oa_locations": [
1831                {"url_for_pdf": "https://publisher.example.org/pdf"},
1832                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"},
1833                {"url": "https://repo.example.edu/handle/123"}
1834            ]
1835        });
1836        let chain = extract_oa_url_chain(Some(&meta));
1837        let strs: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
1838        assert_eq!(
1839            strs,
1840            vec![
1841                "https://publisher.example.org/pdf",
1842                "https://arxiv.org/pdf/2401.12345",
1843                "https://repo.example.edu/handle/123",
1844            ],
1845            "chain ordering MUST be best_oa_location first, oa_locations[] verbatim after"
1846        );
1847    }
1848
1849    #[test]
1850    fn extract_oa_url_chain_dedupes_repeated_urls() {
1851        // A URL that appears as both `best_oa_location` and an entry
1852        // in `oa_locations[]` is fetched at most once. Without this,
1853        // a publisher whose record has the same URL in both slots
1854        // would consume two HTTP requests + two rate-limit ticks.
1855        let meta = serde_json::json!({
1856            "best_oa_location": {
1857                "url_for_pdf": "https://example.org/pdf"
1858            },
1859            "oa_locations": [
1860                {"url_for_pdf": "https://example.org/pdf"},
1861                {"url_for_pdf": "https://example.org/pdf"},
1862                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
1863            ]
1864        });
1865        let chain = extract_oa_url_chain(Some(&meta));
1866        assert_eq!(chain.len(), 2);
1867        assert_eq!(chain[0].as_str(), "https://example.org/pdf");
1868        assert_eq!(chain[1].as_str(), "https://arxiv.org/pdf/2401.12345");
1869    }
1870
1871    #[test]
1872    fn extract_oa_url_chain_skips_unparsable_urls() {
1873        // A malformed URL in oa_locations[] is dropped silently
1874        // rather than aborting the chain — the metadata source can
1875        // emit a stray entry without poisoning the whole fetch.
1876        let meta = serde_json::json!({
1877            "best_oa_location": {
1878                "url_for_pdf": "https://good.example.org/pdf"
1879            },
1880            "oa_locations": [
1881                {"url_for_pdf": "not a url"},
1882                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
1883            ]
1884        });
1885        let chain = extract_oa_url_chain(Some(&meta));
1886        assert_eq!(chain.len(), 2);
1887        assert_eq!(chain[0].as_str(), "https://good.example.org/pdf");
1888        assert_eq!(chain[1].as_str(), "https://arxiv.org/pdf/2401.12345");
1889    }
1890
1891    #[test]
1892    fn fetch_paper_plan_matches_build_fetch_plan() {
1893        // The slice-2-named alias is a thin pass-through to
1894        // `dry_run::build_fetch_plan`. Pin behavioral equivalence so
1895        // a future refactor that diverges them surfaces here.
1896        use crate::{ArxivId, Doi};
1897        let r = Ref::Doi(Doi("10.1234/example".to_string()));
1898        let root = Utf8PathBuf::from("/tmp/doiget-test");
1899        let plan_a = fetch_paper_plan(&r, &root);
1900        let plan_b = build_fetch_plan(&r, &root);
1901        assert_eq!(plan_a.metadata_sources, plan_b.metadata_sources);
1902        assert_eq!(plan_a.target_pdf_path, plan_b.target_pdf_path);
1903        assert_eq!(plan_a.target_metadata_path, plan_b.target_metadata_path);
1904
1905        let r2 = Ref::Arxiv(ArxivId("2401.12345".to_string()));
1906        let plan_c = fetch_paper_plan(&r2, &root);
1907        let plan_d = build_fetch_plan(&r2, &root);
1908        assert_eq!(plan_c.pdf_sources[0].key, plan_d.pdf_sources[0].key);
1909    }
1910
1911    #[test]
1912    fn batch_fetch_plans_returns_plan_per_ref_in_order() {
1913        use crate::{ArxivId, Doi};
1914        let refs = vec![
1915            Ref::Doi(Doi("10.1234/alpha".to_string())),
1916            Ref::Arxiv(ArxivId("2401.12345".to_string())),
1917        ];
1918        let root = Utf8PathBuf::from("/tmp/doiget-batch-test");
1919        let plans = batch_fetch_plans(&refs, &root).expect("under cap returns Ok");
1920        assert_eq!(plans.len(), 2);
1921        // Order preserved.
1922        assert!(matches!(plans[0].0, Ref::Doi(_)));
1923        assert!(matches!(plans[1].0, Ref::Arxiv(_)));
1924        // DOI plan carries the crossref + unpaywall metadata sources.
1925        assert_eq!(plans[0].1.metadata_sources, vec!["crossref", "unpaywall"]);
1926        // arXiv plan has the arxiv PDF source key.
1927        assert_eq!(plans[1].1.pdf_sources[0].key, "arxiv");
1928    }
1929
1930    #[test]
1931    fn batch_fetch_plans_too_many_refs_returns_err() {
1932        use crate::Doi;
1933        // Build MAX_BATCH_REFS + 1 entries — boundary case.
1934        let n = MAX_BATCH_REFS + 1;
1935        let refs: Vec<Ref> = (0..n)
1936            .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1937            .collect();
1938        let root = Utf8PathBuf::from("/tmp/doiget-toomany");
1939        let err = batch_fetch_plans(&refs, &root).expect_err("over cap returns Err");
1940        match err {
1941            FetchError::TooManyRefs { got, max } => {
1942                assert_eq!(got, n);
1943                assert_eq!(max, MAX_BATCH_REFS);
1944            }
1945            other => panic!("expected TooManyRefs, got: {other:?}"),
1946        }
1947    }
1948
1949    #[tokio::test]
1950    async fn batch_fetch_too_many_refs_returns_err_before_any_fetch() {
1951        // The cap is enforced before any per-ref work, so we don't need
1952        // a working store/network here — pass a sentinel store_root and
1953        // a dummy FetchContext that would panic on use.
1954        use crate::http::{tier_1_allowlist, HttpClient};
1955        use crate::provenance::ProvenanceLog;
1956        use crate::rate_limiter::RateLimiter;
1957        use crate::store::FsStore;
1958        use crate::{Doi, RateLimits};
1959        use std::sync::Arc;
1960
1961        let td = tempfile::TempDir::new().expect("tempdir");
1962        let log_path = Utf8Path::from_path(td.path())
1963            .expect("utf-8")
1964            .join("log.jsonl");
1965        let store_root = Utf8Path::from_path(td.path())
1966            .expect("utf-8")
1967            .join("papers");
1968
1969        let ctx = FetchContext {
1970            http: Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client")),
1971            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1972            log: Arc::new(
1973                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1974                    .expect("provenance log"),
1975            ),
1976            session_id: "01J0000000000000000000TEST".into(),
1977        };
1978        let profile = CapabilityProfile::from_env().expect("clean env");
1979        let store = FsStore::new(store_root.clone()).expect("fs store");
1980
1981        let n = MAX_BATCH_REFS + 1;
1982        let refs: Vec<Ref> = (0..n)
1983            .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1984            .collect();
1985
1986        let err = batch_fetch(&refs, &profile, &ctx, &store, &store_root)
1987            .await
1988            .expect_err("over cap returns Err");
1989        match err {
1990            FetchError::TooManyRefs { got, max } => {
1991                assert_eq!(got, n);
1992                assert_eq!(max, MAX_BATCH_REFS);
1993            }
1994            other => panic!("expected TooManyRefs, got: {other:?}"),
1995        }
1996    }
1997
1998    // Issue #118: a non-PDF OA body must surface as `Err(HttpError)`
1999    // from `try_fetch_oa_pdf` (previously silently flattened to
2000    // `None`, which `fetch_paper_doi` then reported as a clean
2001    // metadata-only success). The compiler-checked `Err(e) =>
2002    // PdfLegStatus::Blocked` arm in `fetch_paper_doi` does the rest.
2003    #[tokio::test]
2004    async fn try_fetch_oa_pdf_non_pdf_body_is_err_not_silent_none() {
2005        use crate::http::HttpClient;
2006        use crate::provenance::ProvenanceLog;
2007        use crate::rate_limiter::RateLimiter;
2008        use crate::{Doi, RateLimits};
2009        use std::sync::Arc;
2010        use wiremock::matchers::method;
2011        use wiremock::{Mock, MockServer, ResponseTemplate};
2012
2013        let server = MockServer::start().await;
2014        Mock::given(method("GET"))
2015            .respond_with(
2016                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
2017            )
2018            .mount(&server)
2019            .await;
2020        let host = server
2021            .uri()
2022            .parse::<url::Url>()
2023            .expect("uri")
2024            .host_str()
2025            .expect("host")
2026            .to_string();
2027
2028        let td = tempfile::TempDir::new().expect("tempdir");
2029        let log_path = Utf8Path::from_path(td.path())
2030            .expect("utf-8")
2031            .join("log.jsonl");
2032        let ctx = FetchContext {
2033            http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2034            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2035            log: Arc::new(
2036                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2037                    .expect("provenance log"),
2038            ),
2039            session_id: "01J0000000000000000000TEST".into(),
2040        };
2041
2042        let doi = Doi("10.1234/example".to_string());
2043        let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2044        let res = try_fetch_oa_pdf(&doi, &url, &ctx).await;
2045        match res {
2046            Err(HttpError::NotAPdf { .. }) => {}
2047            other => panic!("expected Err(NotAPdf), got: {other:?}"),
2048        }
2049    }
2050
2051    // Issue #145 / `docs/REDIRECT_ALLOWLIST.md` §1: the `oa-publisher`
2052    // host allowlist MUST be consulted on the metadata-discovered OA URL
2053    // *before the actual PDF fetch is issued*, not only on redirect hops.
2054    // An OA URL whose host is OFF the allowlist and that resolves WITHOUT
2055    // a redirect previously slipped past the redirect closure entirely and
2056    // was misclassified as a transport error. This test pins the fix: the
2057    // pre-fetch check rejects it with the SAME `HttpError::RedirectDenied`
2058    // the redirect closure produces, the OA fetch is NEVER issued (the
2059    // wiremock origin records ZERO requests, proving no PDF bytes were
2060    // requested / written), and the provenance trail is the byte-identical
2061    // `Fetch`/`err`/`oa-publisher`/`NETWORK_ERROR` row the redirect-denied
2062    // path emits.
2063    #[tokio::test]
2064    async fn try_fetch_oa_pdf_off_allowlist_host_no_redirect_is_redirect_denied_145() {
2065        use crate::http::HttpClient;
2066        use crate::provenance::ProvenanceLog;
2067        use crate::rate_limiter::RateLimiter;
2068        use crate::{DenialContext, DenialReason, Doi, RateLimits};
2069        use std::sync::Arc;
2070        use wiremock::matchers::method;
2071        use wiremock::{Mock, MockServer, ResponseTemplate};
2072
2073        // The wiremock origin would serve a valid PDF with NO redirect —
2074        // if the pre-check were absent the fetch would *succeed* against
2075        // an off-allowlist host, which is exactly the §1 violation.
2076        let server = MockServer::start().await;
2077        Mock::given(method("GET"))
2078            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7 real pdf".to_vec()))
2079            .mount(&server)
2080            .await;
2081
        // Register a DIFFERENT host as the `oa-publisher` allowlist so the
        // wiremock origin (127.0.0.1) is OFF it. `allowed-publisher.example.com`
        // is a valid host string that the wiremock origin will not match.
2085        let td = tempfile::TempDir::new().expect("tempdir");
2086        let log_path = Utf8Path::from_path(td.path())
2087            .expect("utf-8")
2088            .join("log.jsonl");
2089        let ctx = FetchContext {
2090            http: Arc::new(HttpClient::new_for_tests_allow_http(
2091                "oa-publisher",
2092                "allowed-publisher.example.com",
2093            )),
2094            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2095            log: Arc::new(
2096                ProvenanceLog::open(log_path.clone(), "01J0000000000000000000TEST".into())
2097                    .expect("provenance log"),
2098            ),
2099            session_id: "01J0000000000000000000TEST".into(),
2100        };
2101
2102        let doi = Doi("10.1234/example".to_string());
2103        // The OA URL Unpaywall handed back resolves to the wiremock host,
2104        // which is OFF the `oa-publisher` allowlist.
2105        let off_host_url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2106        let res = try_fetch_oa_pdf(&doi, &off_host_url, &ctx).await;
2107
2108        // 1. Same error variant the redirect closure produces.
2109        let err = match res {
2110            Err(e @ HttpError::RedirectDenied { .. }) => e,
2111            other => {
2112                panic!("expected Err(RedirectDenied) from the pre-fetch check, got: {other:?}")
2113            }
2114        };
2115        match &err {
2116            HttpError::RedirectDenied {
2117                source_key,
2118                host,
2119                expected_hosts,
2120            } => {
2121                assert_eq!(source_key, "oa-publisher");
2122                // The host is lowercased, exactly as the redirect closure
2123                // would record it.
2124                assert_eq!(
2125                    host,
2126                    off_host_url
2127                        .host_str()
2128                        .expect("wiremock host")
2129                        .to_ascii_lowercase()
2130                        .as_str()
2131                );
2132                assert_eq!(
2133                    expected_hosts,
2134                    &vec!["allowed-publisher.example.com".to_string()]
2135                );
2136            }
2137            _ => unreachable!(),
2138        }
2139
2140        // 2. The OA fetch was NEVER issued — the wiremock origin saw zero
2141        //    requests, so no PDF bytes were requested or written.
2142        assert!(
2143            server
2144                .received_requests()
2145                .await
2146                .unwrap_or_default()
2147                .is_empty(),
2148            "the off-allowlist OA URL must NOT be fetched: the pre-check \
2149             (REDIRECT_ALLOWLIST.md §1) rejects it before any request is \
2150             issued; wiremock recorded request(s)",
2151        );
2152
2153        // 3. The structured denial side-channel is byte-identical to the
2154        //    redirect-closure path: `RedirectNotInAllowlist`, source key,
2155        //    attempted host, expected allowlist snapshot.
2156        let dc: Option<DenialContext> = (&err).into();
2157        let dc = dc.expect("pre-fetch RedirectDenied -> Some(DenialContext)");
2158        assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
2159        assert_eq!(dc.source.as_deref(), Some("oa-publisher"));
2160        assert_eq!(
2161            dc.attempted,
2162            Some(off_host_url.host_str().expect("host").to_ascii_lowercase()),
2163            "attempted host must be the rejected OA URL host, lowercased — \
2164             identical to what the redirect closure records",
2165        );
2166        assert_eq!(
2167            dc.expected,
2168            Some(vec!["allowed-publisher.example.com".to_string()]),
2169        );
2170
2171        // 4. Provenance: exactly the `Fetch`/`err`/`oa-publisher`/
2172        //    `NETWORK_ERROR` row the post-fetch redirect-denied arm emits
2173        //    (same row kind + source key + closed-set code).
2174        let log_txt = std::fs::read_to_string(&log_path).expect("read provenance log");
2175        let fetch_err_row = log_txt
2176            .lines()
2177            .filter_map(|l| serde_json::from_str::<serde_json::Value>(l).ok())
2178            .find(|v| {
2179                v.get("event").and_then(|e| e.as_str()) == Some("fetch")
2180                    && v.get("result").and_then(|r| r.as_str()) == Some("err")
2181            })
2182            .expect("a Fetch/err provenance row was written");
2183        assert_eq!(
2184            fetch_err_row.get("source").and_then(|s| s.as_str()),
2185            Some("oa-publisher"),
2186        );
2187        assert_eq!(
2188            fetch_err_row.get("error_code").and_then(|c| c.as_str()),
2189            Some("NETWORK_ERROR"),
2190        );
2191        assert_eq!(
2192            fetch_err_row.get("ref").and_then(|r| r.as_str()),
2193            Some("10.1234/example"),
2194        );
2195    }
2196
2197    // Issue #145 positive / no-regression: an ON-allowlist OA URL still
2198    // fetches the PDF normally. The pre-fetch check must be a pure gate —
2199    // it must not perturb the happy path.
2200    #[tokio::test]
2201    async fn try_fetch_oa_pdf_on_allowlist_host_still_fetches_pdf_no_regression_145() {
2202        use crate::http::HttpClient;
2203        use crate::provenance::ProvenanceLog;
2204        use crate::rate_limiter::RateLimiter;
2205        use crate::{Doi, RateLimits};
2206        use std::sync::Arc;
2207        use wiremock::matchers::method;
2208        use wiremock::{Mock, MockServer, ResponseTemplate};
2209
2210        let server = MockServer::start().await;
2211        let body = b"%PDF-1.7\nhello pdf".to_vec();
2212        Mock::given(method("GET"))
2213            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
2214            .mount(&server)
2215            .await;
2216        // The wiremock host IS the registered `oa-publisher` allowlist, so
2217        // the pre-check passes and the fetch proceeds as before.
2218        let host = server
2219            .uri()
2220            .parse::<url::Url>()
2221            .expect("uri")
2222            .host_str()
2223            .expect("host")
2224            .to_string();
2225
2226        let td = tempfile::TempDir::new().expect("tempdir");
2227        let log_path = Utf8Path::from_path(td.path())
2228            .expect("utf-8")
2229            .join("log.jsonl");
2230        let ctx = FetchContext {
2231            http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2232            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2233            log: Arc::new(
2234                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2235                    .expect("provenance log"),
2236            ),
2237            session_id: "01J0000000000000000000TEST".into(),
2238        };
2239
2240        let doi = Doi("10.1234/example".to_string());
2241        let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2242        let (bytes, _final_url) = try_fetch_oa_pdf(&doi, &url, &ctx)
2243            .await
2244            .expect("on-allowlist OA URL still fetches the PDF");
2245        assert_eq!(bytes, body, "PDF bytes must be returned unchanged");
2246    }
2247
2248    // Issue #145: the pre-fetch denial and the redirect-closure denial
2249    // MUST produce a byte-identical `DenialContext` so PR #162's CLI
2250    // classification (CAPABILITY_DENIED / exit 3) handles both unchanged.
2251    // This pins the equivalence at the value level: the same source key +
2252    // host + allowlist snapshot map through the SAME
2253    // `From<&HttpError> for Option<DenialContext>` impl to equal structs.
2254    #[test]
2255    fn pre_fetch_denial_produces_byte_identical_denial_context_as_redirect_denied_145() {
2256        use crate::{DenialContext, DenialReason};
2257
2258        // Shape produced by the pre-fetch check in `try_fetch_oa_pdf`.
2259        let pre_fetch = HttpError::RedirectDenied {
2260            source_key: "oa-publisher".to_string(),
2261            host: "attacker.test".to_string(),
2262            expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
2263        };
2264        // Shape produced by the redirect closure in `crate::http` for the
2265        // identical inputs.
2266        let redirect_closure = HttpError::RedirectDenied {
2267            source_key: "oa-publisher".to_string(),
2268            host: "attacker.test".to_string(),
2269            expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
2270        };
2271
2272        let dc_pre: Option<DenialContext> = (&pre_fetch).into();
2273        let dc_red: Option<DenialContext> = (&redirect_closure).into();
2274        let dc_pre = dc_pre.expect("pre-fetch -> Some");
2275        let dc_red = dc_red.expect("redirect -> Some");
2276
2277        // Byte-identical: same reason, same source, same attempted host,
2278        // same expected snapshot, all auxiliary channels None.
2279        assert_eq!(dc_pre, dc_red);
2280        assert_eq!(dc_pre.reason, DenialReason::RedirectNotInAllowlist);
2281        assert_eq!(dc_pre.source.as_deref(), Some("oa-publisher"));
2282        assert_eq!(dc_pre.attempted.as_deref(), Some("attacker.test"));
2283        assert_eq!(
2284            dc_pre.expected,
2285            Some(vec!["*.springer.com".to_string(), "*.plos.org".to_string()]),
2286        );
2287        assert_eq!(dc_pre.hop_index, None);
2288        assert_eq!(dc_pre.cap, None);
2289        assert_eq!(dc_pre.actual, None);
2290    }
2291
2292    // -----------------------------------------------------------------
2293    // #139 — metadata_only_to_store writes the metadata TOML;
2294    //        resolve_only / pure metadata_only write NOTHING.
2295    // -----------------------------------------------------------------
2296
2297    /// Build a ctx + FsStore under a fresh tempdir and point Crossref at
2298    /// a wiremock origin that returns one minimal `message`. Returns
2299    /// `(server, ctx, store, store_root, _td)` — `_td` keeps the tempdir
2300    /// alive for the test body.
2301    async fn md139_harness() -> (
2302        wiremock::MockServer,
2303        FetchContext,
2304        crate::store::FsStore,
2305        Utf8PathBuf,
2306        tempfile::TempDir,
2307    ) {
2308        use crate::http::HttpClient;
2309        use crate::provenance::ProvenanceLog;
2310        use crate::rate_limiter::RateLimiter;
2311        use crate::store::FsStore;
2312        use crate::RateLimits;
2313        use std::sync::Arc;
2314        use wiremock::matchers::method;
2315        use wiremock::{Mock, MockServer, ResponseTemplate};
2316
2317        let server = MockServer::start().await;
2318        Mock::given(method("GET"))
2319            .respond_with(ResponseTemplate::new(200).set_body_string(
2320                r#"{"status":"ok","message":{"title":["Example Paper"],"author":[{"given":"Ada","family":"Lovelace"}]}}"#,
2321            ))
2322            .mount(&server)
2323            .await;
2324        std::env::set_var("DOIGET_CROSSREF_BASE", server.uri());
2325
2326        // wiremock serves http://127.0.0.1:PORT; the production client is
2327        // https_only, so the test ctx uses the allow-http test client
2328        // scoped to the crossref/unpaywall source keys + the wiremock host.
2329        let host = server
2330            .uri()
2331            .parse::<url::Url>()
2332            .expect("uri")
2333            .host_str()
2334            .expect("host")
2335            .to_string();
2336
2337        let td = tempfile::TempDir::new().expect("tempdir");
2338        let base = Utf8Path::from_path(td.path()).expect("utf-8");
2339        let log_path = base.join("log.jsonl");
2340        let store_root = base.join("papers");
2341        let ctx = FetchContext {
2342            http: Arc::new(HttpClient::new_for_tests_allow_http_multi(&[
2343                ("crossref", &host),
2344                ("unpaywall", &host),
2345            ])),
2346            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2347            log: Arc::new(
2348                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2349                    .expect("provenance log"),
2350            ),
2351            session_id: "01J0000000000000000000TEST".into(),
2352        };
2353        let store = FsStore::new(store_root.clone()).expect("fs store");
2354        (server, ctx, store, store_root, td)
2355    }
2356
2357    fn metadata_dir_tomls(store_root: &Utf8Path) -> Vec<Utf8PathBuf> {
2358        let md = store_root.join(".metadata");
2359        match std::fs::read_dir(md.as_std_path()) {
2360            Ok(rd) => rd
2361                .filter_map(|e| e.ok())
2362                .filter_map(|e| Utf8PathBuf::from_path_buf(e.path()).ok())
2363                .filter(|p| p.extension() == Some("toml"))
2364                .collect(),
2365            Err(_) => Vec::new(),
2366        }
2367    }
2368
2369    #[tokio::test]
2370    #[serial_test::serial]
2371    async fn metadata_only_to_store_writes_metadata_toml_139() {
2372        let (_server, ctx, store, store_root, _td) = md139_harness().await;
2373        let profile = CapabilityProfile::from_env().expect("clean env");
2374        let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2375
2376        let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2377            .await
2378            .expect("metadata_only_to_store ok");
2379        assert_eq!(outcome.source, "crossref");
2380
2381        let tomls = metadata_dir_tomls(&store_root);
2382        assert_eq!(
2383            tomls.len(),
2384            1,
2385            "exactly one .metadata/*.toml must be written (MCP_TOOLS.md §11 SIDE EFFECT, #139); got {tomls:?}"
2386        );
2387        let body = std::fs::read_to_string(&tomls[0]).expect("read metadata toml");
2388        let meta: crate::store::Metadata = toml::from_str(&body).expect("parse metadata toml");
2389        assert_eq!(meta.title, "Example Paper");
2390        assert_eq!(
2391            meta.doi.as_ref().map(|d| d.as_str()),
2392            Some("10.1234/example")
2393        );
2394        let ext = meta.doiget.expect("[doiget] table present");
2395        assert_eq!(ext.source, "crossref");
2396        assert_eq!(ext.size_bytes, 0, "metadata-only entry has no PDF");
2397
2398        std::env::remove_var("DOIGET_CROSSREF_BASE");
2399    }
2400
2401    #[tokio::test]
2402    #[serial_test::serial]
2403    async fn resolve_only_and_pure_metadata_only_write_nothing_139() {
2404        let (_server, ctx, _store, store_root, _td) = md139_harness().await;
2405        let profile = CapabilityProfile::from_env().expect("clean env");
2406        let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2407
2408        // resolve_only: contractually MUST NOT touch the store.
2409        let r = resolve_only(&ref_, &profile, &ctx)
2410            .await
2411            .expect("resolve_only ok");
2412        assert_eq!(r.source, "crossref");
2413        assert!(
2414            metadata_dir_tomls(&store_root).is_empty(),
2415            "resolve_only MUST NOT write a metadata TOML (docs/MCP_TOOLS.md §1; #139)"
2416        );
2417
2418        // The pure metadata_only is also write-free (the store-write
2419        // lives only in metadata_only_to_store).
2420        let m = metadata_only(&ref_, &profile, &ctx)
2421            .await
2422            .expect("metadata_only ok");
2423        assert_eq!(m.source, "crossref");
2424        assert!(
2425            metadata_dir_tomls(&store_root).is_empty(),
2426            "pure metadata_only MUST NOT write to the store (#139)"
2427        );
2428
2429        std::env::remove_var("DOIGET_CROSSREF_BASE");
2430    }
2431
2432    /// #139 — the arXiv branch of `metadata_only_to_store` must also
2433    /// write the metadata TOML (different code path: Atom feed,
2434    /// source="arxiv", license="arxiv-default", doi=None). Review I3/C1.
2435    #[tokio::test]
2436    #[serial_test::serial]
2437    async fn metadata_only_to_store_arxiv_writes_metadata_toml_139() {
2438        use crate::http::HttpClient;
2439        use crate::provenance::ProvenanceLog;
2440        use crate::rate_limiter::RateLimiter;
2441        use crate::store::FsStore;
2442        use crate::RateLimits;
2443        use std::sync::Arc;
2444        use wiremock::matchers::method;
2445        use wiremock::{Mock, MockServer, ResponseTemplate};
2446
2447        let atom = r#"<?xml version="1.0" encoding="UTF-8"?>
2448<feed xmlns="http://www.w3.org/2005/Atom">
2449  <entry>
2450    <id>http://arxiv.org/abs/2401.12345v1</id>
2451    <published>2024-01-15T00:00:00Z</published>
2452    <title>Example arXiv Paper Title</title>
2453    <summary>Example abstract.</summary>
2454    <author><name>Jane Doe</name></author>
2455    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
2456  </entry>
2457</feed>"#;
2458        let server = MockServer::start().await;
2459        Mock::given(method("GET"))
2460            .respond_with(ResponseTemplate::new(200).set_body_string(atom))
2461            .mount(&server)
2462            .await;
2463        std::env::set_var("DOIGET_ARXIV_BASE", server.uri());
2464        let host = server
2465            .uri()
2466            .parse::<url::Url>()
2467            .expect("uri")
2468            .host_str()
2469            .expect("host")
2470            .to_string();
2471
2472        let td = tempfile::TempDir::new().expect("tempdir");
2473        let base = Utf8Path::from_path(td.path()).expect("utf-8");
2474        let store_root = base.join("papers");
2475        let ctx = FetchContext {
2476            http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", &host)),
2477            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2478            log: Arc::new(
2479                ProvenanceLog::open(base.join("log.jsonl"), "01J0000000000000000000TEST".into())
2480                    .expect("provenance log"),
2481            ),
2482            session_id: "01J0000000000000000000TEST".into(),
2483        };
2484        let store = FsStore::new(store_root.clone()).expect("fs store");
2485        let profile = CapabilityProfile::from_env().expect("clean env");
2486        let ref_ = Ref::Arxiv(crate::ArxivId::parse("2401.12345").expect("arxiv id"));
2487
2488        let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2489            .await
2490            .expect("metadata_only_to_store (arxiv) ok");
2491        assert_eq!(outcome.source, "arxiv");
2492
2493        let tomls = metadata_dir_tomls(&store_root);
2494        assert_eq!(
2495            tomls.len(),
2496            1,
2497            "arXiv metadata-only must write one TOML; got {tomls:?}"
2498        );
2499        let meta: crate::store::Metadata =
2500            toml::from_str(&std::fs::read_to_string(&tomls[0]).expect("read")).expect("parse");
2501        assert_eq!(meta.title, "Example arXiv Paper Title");
2502        assert_eq!(
2503            meta.arxiv_id.as_ref().map(|a| a.as_str()),
2504            Some("2401.12345")
2505        );
2506        assert!(meta.doi.is_none(), "arXiv entry has no DOI");
2507        let ext = meta.doiget.expect("[doiget] table");
2508        assert_eq!(ext.source, "arxiv");
2509        assert_eq!(ext.license, "arxiv-default");
2510
2511        std::env::remove_var("DOIGET_ARXIV_BASE");
2512    }
2513
2514    // ----- pure-function unit tests for the #139 extraction helpers ----
2515
2516    #[test]
2517    fn extract_metadata_title_handles_string_array_missing_blank() {
2518        use serde_json::json;
2519        // bare string (arXiv/Unpaywall shape)
2520        assert_eq!(
2521            extract_metadata_title(&json!({"title": "Hello"})),
2522            Some("Hello".to_string())
2523        );
2524        // single-element array (Crossref `message.title` in practice)
2525        assert_eq!(
2526            extract_metadata_title(&json!({"title": ["Real Title"]})),
2527            Some("Real Title".to_string())
2528        );
2529        // missing key -> None (caller falls back to ref id)
2530        assert_eq!(extract_metadata_title(&json!({"x": 1})), None);
2531        // blank string -> None (must not persist an empty title)
2532        assert_eq!(extract_metadata_title(&json!({"title": "   "})), None);
2533        // empty array -> None
2534        assert_eq!(extract_metadata_title(&json!({"title": []})), None);
2535        // A leading blank/whitespace array element is SKIPPED — the first
2536        // non-blank element is taken (a stray leading empty element must
2537        // not mask the real Crossref title).
2538        assert_eq!(
2539            extract_metadata_title(&json!({"title": ["  ", "Real Title"]})),
2540            Some("Real Title".to_string())
2541        );
2542        // all-blank array -> None (caller falls back to ref id)
2543        assert_eq!(extract_metadata_title(&json!({"title": ["  ", ""]})), None);
2544    }
2545
2546    #[test]
2547    fn extract_metadata_authors_handles_each_resolver_shape() {
2548        use serde_json::json;
2549        // arXiv: authors: [String]
2550        assert_eq!(
2551            extract_metadata_authors(&json!({"authors": ["Jane Doe", "John Roe"]})),
2552            vec!["Jane Doe".to_string(), "John Roe".to_string()]
2553        );
2554        // Crossref: author: [{given,family}]
2555        assert_eq!(
2556            extract_metadata_authors(&json!({"author": [{"given": "Ada", "family": "Lovelace"}]})),
2557            vec!["Ada Lovelace".to_string()]
2558        );
2559        // family-only (given absent) -> trimmed, no leading space
2560        assert_eq!(
2561            extract_metadata_authors(&json!({"author": [{"family": "Onsager"}]})),
2562            vec!["Onsager".to_string()]
2563        );
2564        // `name` fallback when given+family both absent
2565        assert_eq!(
2566            extract_metadata_authors(&json!({"author": [{"name": "K. Wilson"}]})),
2567            vec!["K. Wilson".to_string()]
2568        );
2569        // z_authors fallback shape (forward-compat branch)
2570        assert_eq!(
2571            extract_metadata_authors(&json!({"z_authors": [{"given": "L", "family": "Kadanoff"}]})),
2572            vec!["L Kadanoff".to_string()]
2573        );
2574        // nothing parseable -> empty (still a valid TOML)
2575        assert!(extract_metadata_authors(&json!({"x": 1})).is_empty());
2576        assert!(extract_metadata_authors(&json!({"authors": []})).is_empty());
2577    }
2578}