doiget_core/orchestrator.rs
1//! Cross-source orchestrators that compose multiple [`Source`] impls into
2//! a single user-facing operation.
3//!
4//! Slice 2 of the doiget roadmap promotes [`fetch_paper`] and
5//! [`batch_fetch`] from `doiget-cli` into this module so the MCP server
6//! (`doiget-mcp`) and the CLI share one source of truth for the per-ref
7//! orchestration. The CLI's `commands::fetch::fetch_one` is now a thin
8//! wrapper that delegates here and adds the human-facing stderr print
9//! line. Dry-run preview helpers live as [`fetch_paper_plan`] and
10//! [`batch_fetch_plans`].
11//!
12//! [`Source`]: crate::source::Source
13
14use std::collections::BTreeMap;
15
16use camino::{Utf8Path, Utf8PathBuf};
17use chrono::Utc;
18use serde_json::Value;
19
20use crate::dry_run::{build_fetch_plan, try_build_fetch_plan, FetchPlan};
21use crate::http::HttpError;
22use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
23use crate::source::{FetchContext, FetchError, FetchResult, Source};
24use crate::sources::arxiv::ArxivSource;
25use crate::sources::crossref::CrossrefSource;
26use crate::sources::unpaywall::UnpaywallSource;
27use crate::store::{DoigetExtension, Metadata, Store};
28use crate::{ArxivId, CapabilityProfile, Doi, Ref, Safekey, MAX_BATCH_REFS, SCHEMA_VERSION};
29
/// Outcome of a successful [`metadata_only`] call.
///
/// Mirrors the wire shape documented in `docs/MCP_TOOLS.md` §11: the
/// `source` identifies which resolver produced the metadata, `license`
/// is the OA license string when known, `oa_url` is the discovered OA
/// URL **(never followed by this orchestrator)**, and `metadata` is the
/// source's native JSON payload (Crossref `message`, Unpaywall work
/// record, or the parsed arXiv Atom-feed object).
///
/// `metadata` is serialized as-is by the MCP envelope builder
/// (`crates/doiget-mcp/src/lib.rs`); we deliberately do NOT normalize
/// here so the agent can see exactly what the source returned.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MetadataOnlyOutcome {
    /// Resolver key that produced the metadata payload. One of
    /// `"crossref"`, `"unpaywall"`, `"arxiv"` (the closed set named in
    /// `docs/MCP_TOOLS.md` §11 type alias).
    pub source: String,
    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
    /// was minted for this call. In Slice 4 this equals
    /// [`Self::source`] verbatim (the metadata-only path emits one row
    /// per consulted resolver); future slices that introduce overlapping
    /// resolvers MAY have `resolver_profile != source`. Surfaced through
    /// the `doiget_metadata_only` MCP envelope per ADR-0021 §4.
    pub resolver_profile: String,
    /// OA license string when the resolver could supply one.
    ///
    /// - arXiv refs: always the platform-wide `"arxiv-default"` token.
    /// - Unpaywall fallback: the license the source reported, or `None`
    ///   when that value is the `"unknown"` sentinel.
    /// - Crossref: always `None` (Crossref is not used as a license
    ///   channel by this orchestrator).
    pub license: Option<String>,
    /// Discovered OA URL — surfaced to the caller for separate action,
    /// **never followed by this orchestrator**. On the Crossref path
    /// this is mined from the `message.link[]` array; the Unpaywall
    /// fallback path uses `best_oa_location.url_for_pdf` (or `url`).
    /// Always `None` for arXiv refs.
    pub oa_url: Option<String>,
    /// Source's native metadata payload. For Crossref this is the
    /// `message` object; for Unpaywall the work record; for arXiv the
    /// parsed Atom-feed JSON (see
    /// `crate::sources::arxiv::parse_atom_feed`). On the DOI paths this
    /// is JSON `null` when the source succeeded but returned no JSON
    /// payload.
    pub metadata: Value,
}
71
72/// Resolve a [`Ref`] to metadata WITHOUT triggering a publisher PDF
73/// fetch.
74///
75/// Binding spec: `docs/MCP_TOOLS.md` §11 (NORMATIVE — this function
76/// MUST NOT call [`crate::http::HttpClient::fetch_pdf`] under any code
77/// path). The posture-lint workflow greps for that pattern; the test
78/// suite additionally exercises the DOI and arXiv branches end-to-end
79/// against wiremock to assert the OA URL is reported, not followed.
80///
81/// # Dispatch
82///
83/// - `Ref::Doi(_)` → Crossref first (bibliographic metadata + OA URL
84/// via `message.link[]`). If Crossref returns a usable payload the
85/// call returns immediately; Unpaywall is consulted only as a fallback
86/// when Crossref fails. The Unpaywall fallback surfaces a license
87/// string and may overwrite `oa_url` with the `best_oa_location`
88/// channel.
89/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch_metadata_only`]: ONLY the
90/// Atom feed (`https://export.arxiv.org/api/query?id_list=<id>`) is
91/// consulted; the PDF endpoint is NOT touched. `license` is set to
92/// the platform-wide `"arxiv-default"` token, `oa_url` is `None`
93/// (the arXiv abstract page is not a PDF URL).
94///
95/// # Side effects
96///
97/// Each consulted source appends ONE `LogEvent::Fetch` row to
98/// `ctx.log` (arXiv emits its row under `Capability::Metadata`; the
99/// DOI sources emit under `Capability::Oa` — they pre-date this
100/// distinction and a follow-up slice may unify them). The orchestrator
101/// itself does NOT bracket the call with `SessionStart` / `SessionEnd`
102/// rows — that is the MCP server's responsibility (it owns the
103/// per-tool-call session boundary).
104///
105/// This function is the **pure resolver**: it consults the source(s)
106/// and emits provenance rows, but it does NOT write to the store.
107/// The `docs/MCP_TOOLS.md` §11 store-write SIDE EFFECT is provided by
108/// [`metadata_only_to_store`], which wraps this and persists the
109/// metadata TOML to `<root>/.metadata/<safekey>.toml`. Keeping the
110/// store-write in a *separate* entry point is exactly what lets
111/// [`resolve_only`] safely delegate here — its contract forbids any
112/// store write, and a pure `metadata_only` can never regress that
113/// invariant (#139).
114///
115/// # Errors
116///
117/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
118/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
119/// via the existing `From<FetchError> for ErrorCode` impl.
120// Stays `pub` (a `pub(crate)` compile-time guard was considered and
121// rejected): `crates/doiget-core/tests/` integration tests
122// (`real_world_fixtures_e2e`) legitimately drive the PURE resolver
123// directly and assert its outcome, and `tests/` compiles as a separate
124// crate. The #139 pre-fix bug (an MCP caller
125// picking the pure variant when it needed persistence) is instead
126// prevented *structurally*: the MCP layer imports only
127// `metadata_only_to_store`, and `resolve_only` delegates to this pure
128// fn — neither can acquire or skip the store-write by mistake.
129pub async fn metadata_only(
130 ref_: &Ref,
131 profile: &CapabilityProfile,
132 ctx: &FetchContext,
133) -> Result<MetadataOnlyOutcome, FetchError> {
134 match ref_ {
135 Ref::Doi(doi) => metadata_only_doi(doi, ref_, profile, ctx).await,
136 Ref::Arxiv(id) => {
137 let arxiv = arxiv_source_from_env();
138 let metadata = arxiv.fetch_metadata_only(id, ctx).await?;
139 // Pure resolver — no store write here (see fn doc); the
140 // store-write side effect lives in `metadata_only_to_store`.
141 Ok(MetadataOnlyOutcome {
142 source: arxiv.name().to_string(),
143 resolver_profile: arxiv.name().to_string(),
144 license: Some("arxiv-default".to_string()),
145 oa_url: None,
146 metadata,
147 })
148 }
149 }
150}
151
152/// Resolve a [`Ref`] to metadata with **no local persistence**.
153///
154/// This is the audit-trail-preserving sibling of [`metadata_only`]: each
155/// consulted [`Source`] still emits its own `LogEvent::Fetch` row
156/// through `ctx.log` (so the provenance hash chain remains continuous,
157/// per `docs/PROVENANCE_LOG.md`), but the orchestrator MUST NOT write
158/// the metadata TOML to the store under any code path — present or
159/// future.
160///
161/// Binding spec: `docs/MCP_TOOLS.md` §1 (the `doiget_resolve_paper`
162/// tool — Slice 7).
163///
164/// # Why this exists as a distinct orchestrator
165///
166/// [`metadata_only`] is the **pure resolver** and never writes to the
167/// store; the store-write SIDE EFFECT lives only in the separate
168/// [`metadata_only_to_store`] wrapper. Because the write is in a
169/// *different* entry point that this function does not call,
170/// delegating to [`metadata_only`] is permanently safe — there is no
171/// code path by which `resolve_only` can acquire a store write, now or
172/// in future (#139). This structural separation is the entire reason
173/// `metadata_only` was split into a pure core + a persisting wrapper
174/// rather than gaining a `write: bool` parameter.
175///
176/// # Dispatch
177///
178/// Identical to [`metadata_only`] (DOI → Crossref-first with Unpaywall
179/// fallback; arXiv → Atom feed only). The `oa_url` and `license`
180/// outputs follow the same rules.
181///
182/// # Side effects
183///
184/// One `LogEvent::Fetch` row per consulted resolver, written by the
185/// underlying [`Source`] impls. No metadata TOML write. No PDF fetch.
186/// No store mutation.
187///
188/// # Errors
189///
190/// Returns [`FetchError`] from the underlying [`Source`] dispatch,
191/// identical to [`metadata_only`].
pub async fn resolve_only(
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
) -> Result<MetadataOnlyOutcome, FetchError> {
    // Straight delegation to the PURE resolver is the intended,
    // contract-correct implementation (not a placeholder). This
    // function never calls `metadata_only_to_store`, the only entry
    // point that persists metadata, so the "no store mutation"
    // guarantee holds by construction (#139).
    metadata_only(ref_, profile, ctx).await
}
205
206/// Resolve a [`Ref`] to metadata **and persist the metadata TOML to the
207/// store** — the `docs/MCP_TOOLS.md` §11 `doiget_metadata_only` SIDE
208/// EFFECT (#139).
209///
210/// Wraps the pure [`metadata_only`]: it runs the same resolver dispatch
211/// (so the provenance hash chain is identical), then writes
212/// `<root>/.metadata/<safekey>.toml` via the same
213/// `write_metadata_and_pdf` path `fetch_paper` uses for its
214/// metadata-only fallback, emitting one `StoreWrite` provenance row.
215///
216/// [`resolve_only`] MUST NOT call this — its contract forbids any store
217/// write. The split (pure core vs. persisting wrapper) makes that
218/// invariant structural rather than a convention.
219///
220/// # Errors
221///
222/// [`FetchError`] from the underlying resolver dispatch, or — if the
223/// store write fails — [`FetchError::SourceSchema`] (the closest
224/// closed-set arm; there is no dedicated `FetchError::StoreError`, so
225/// the MCP boundary maps it to `INTERNAL_ERROR` — see the inline note
226/// in `write_metadata_and_pdf`). On store-write failure
227/// `write_metadata_and_pdf` makes a **best-effort** attempt to
228/// append a `StoreWrite`/`Err` provenance row before the error
229/// propagates (that append's own failure is not separately surfaced —
230/// this matches the pre-existing `fetch_paper` metadata-only fallback
231/// path and is out of scope for #139).
232pub async fn metadata_only_to_store(
233 ref_: &Ref,
234 profile: &CapabilityProfile,
235 ctx: &FetchContext,
236 store: &dyn Store,
237) -> Result<MetadataOnlyOutcome, FetchError> {
238 let outcome = metadata_only(ref_, profile, ctx).await?;
239 let safekey = ref_.safekey();
240 let metadata = build_metadata_only_metadata(ref_, &outcome);
241 // `pdf_src = None` => writes `<root>/.metadata/<safekey>.toml` and
242 // appends the `StoreWrite` row (the exact path `fetch_paper` uses
243 // for its DOI metadata-only fallback).
244 write_metadata_and_pdf(store, &safekey, &metadata, None, ctx)?;
245 Ok(outcome)
246}
247
248/// Build the [`Metadata`] persisted by [`metadata_only_to_store`].
249///
250/// Minimal but valid: enough that a subsequent `doiget_info` returns a
251/// non-null `metadata` object (the #139 acceptance criterion). Title is
252/// best-effort from the resolver payload (`title` as a string, or the
253/// first element if it is an array — Crossref's `message.title` is
254/// typically an array, arXiv/Unpaywall typically a string; the
255/// extractor tolerates either regardless of source); it falls back to
256/// the ref id so the required `title` field is never empty.
257/// Bibliographic enrichment
258/// (year, venue, …) is intentionally out of scope here — the
259/// metadata-only contract is "persist what the resolver returned", and
260/// the raw payload is preserved verbatim in `MetadataOnlyOutcome`.
261fn build_metadata_only_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
262 let (doi, arxiv_id) = match ref_ {
263 Ref::Doi(d) => (Some(d.clone()), None),
264 Ref::Arxiv(a) => (None, Some(a.clone())),
265 };
266 let ref_id = ref_.as_input_str().to_string();
267 let title = match extract_metadata_title(&outcome.metadata) {
268 Some(t) => t,
269 None => {
270 // The resolver returned a payload with no usable title.
271 // Persisting the ref id keeps the entry valid (#139), but
272 // emit a diagnostic so a broken/partial resolver response is
273 // not silently indistinguishable from a genuine title.
274 tracing::warn!(
275 ref_id = %ref_id,
276 source = %outcome.source,
277 "metadata-only: no usable title in resolver payload; \
278 persisting the ref id as the title placeholder"
279 );
280 ref_id
281 }
282 };
283 Metadata {
284 schema_version: SCHEMA_VERSION.to_string(),
285 title,
286 authors: extract_metadata_authors(&outcome.metadata),
287 year: None,
288 doi,
289 arxiv_id,
290 abstract_: None,
291 venue: None,
292 publisher: None,
293 issn: None,
294 isbn: None,
295 type_: None,
296 keywords: Vec::new(),
297 url: outcome.oa_url.clone(),
298 pdf_path: None,
299 doiget: Some(DoigetExtension {
300 fetched_at: Utc::now(),
301 source: outcome.source.clone(),
302 license: outcome
303 .license
304 .clone()
305 .unwrap_or_else(|| "unknown".to_string()),
306 size_bytes: 0,
307 mcp_call_id: None,
308 }),
309 other: BTreeMap::new(),
310 }
311}
312
313/// `title` from a resolver payload: a bare string, or the first
314/// **non-blank** element of an array (Crossref `message.title` is
315/// `[String]`; a leading empty/whitespace element is skipped rather
316/// than masking the real title). Trimmed. `None` if absent/blank.
317fn extract_metadata_title(meta: &Value) -> Option<String> {
318 let t = meta.get("title")?;
319 let s = match t.as_str() {
320 Some(s) => s.trim().to_string(),
321 None => t
322 .as_array()?
323 .iter()
324 .filter_map(Value::as_str)
325 .map(str::trim)
326 .find(|s| !s.is_empty())?
327 .to_string(),
328 };
329 if s.is_empty() {
330 None
331 } else {
332 Some(s)
333 }
334}
335
336/// Best-effort author list, tolerant of the resolver shapes we may see:
337/// Crossref `author: [{given,family}]`, arXiv `authors: [String]`, and
338/// a `z_authors: [{given,family}]` fallback. NOTE: doiget's Unpaywall
339/// source deserializes a *partial* `UnpaywallWork` that does not capture
340/// `z_authors`, so the `z_authors` branch is currently inert for the
341/// Unpaywall path (kept as forward-compat for if/when that struct
342/// captures it) — Unpaywall-sourced metadata-only entries get an empty
343/// author list. Returns `Vec::new()` when nothing is parseable (a valid
344/// metadata TOML — #139 only requires the entry to exist and be
345/// readable).
346fn extract_metadata_authors(meta: &Value) -> Vec<String> {
347 if let Some(arr) = meta.get("authors").and_then(Value::as_array) {
348 let v: Vec<String> = arr
349 .iter()
350 .filter_map(|a| a.as_str().map(str::to_string))
351 .collect();
352 if !v.is_empty() {
353 return v;
354 }
355 }
356 for key in ["author", "z_authors"] {
357 if let Some(arr) = meta.get(key).and_then(Value::as_array) {
358 let v: Vec<String> = arr
359 .iter()
360 .filter_map(|a| {
361 let given = a.get("given").and_then(Value::as_str).unwrap_or("");
362 let family = a.get("family").and_then(Value::as_str).unwrap_or("");
363 let name = format!("{given} {family}");
364 let name = name.trim();
365 if name.is_empty() {
366 a.get("name").and_then(Value::as_str).map(str::to_string)
367 } else {
368 Some(name.to_string())
369 }
370 })
371 .collect();
372 if !v.is_empty() {
373 return v;
374 }
375 }
376 }
377 Vec::new()
378}
379
380// ---------------------------------------------------------------------------
381// Env-aware source constructors (mirrors doiget-cli::commands::fetch::build_*)
382//
383// These let MCP integration tests redirect the orchestrator at a
384// wiremock origin via `DOIGET_*_BASE` env vars, without inverting the
385// `doiget-mcp -> doiget-core` wiring by depending on `doiget-cli`. The
386// override surface is identical to the CLI's `fetch.rs::build_*_source`
387// helpers so a single test fixture can drive both crates.
388// ---------------------------------------------------------------------------
389
/// Contact address used when `DOIGET_CONTACT_EMAIL` is not usable —
/// the same `doiget@localhost` default the CLI uses
/// (`crates/doiget-cli/src/commands/fetch.rs::OrchestratorConfig`).
const FALLBACK_CONTACT_EMAIL: &str = "doiget@localhost";

/// Contact e-mail handed to the DOI sources.
///
/// Reads `DOIGET_CONTACT_EMAIL` and returns it trimmed. Falls back to
/// [`FALLBACK_CONTACT_EMAIL`] when the variable is unset, is not valid
/// Unicode, or is empty / whitespace-only — so a variable that was
/// exported but left blank never yields an empty contact string.
fn contact_email_from_env() -> String {
    std::env::var("DOIGET_CONTACT_EMAIL")
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|email| !email.is_empty())
        .unwrap_or_else(|| FALLBACK_CONTACT_EMAIL.to_string())
}
397
398fn arxiv_source_from_env() -> ArxivSource {
399 if let Ok(s) = std::env::var("DOIGET_ARXIV_BASE") {
400 if let Ok(url) = url::Url::parse(&s) {
401 return ArxivSource::with_base(url);
402 }
403 }
404 ArxivSource::new()
405}
406
407fn crossref_source_from_env(contact: &str) -> CrossrefSource {
408 if let Ok(s) = std::env::var("DOIGET_CROSSREF_BASE") {
409 if let Ok(url) = url::Url::parse(&s) {
410 return CrossrefSource::with_base(url, contact.to_string());
411 }
412 }
413 CrossrefSource::new(contact.to_string())
414}
415
416fn unpaywall_source_from_env(contact: &str) -> UnpaywallSource {
417 if let Ok(s) = std::env::var("DOIGET_UNPAYWALL_BASE") {
418 if let Ok(url) = url::Url::parse(&s) {
419 return UnpaywallSource::with_base(url, contact.to_string());
420 }
421 }
422 UnpaywallSource::new(contact.to_string())
423}
424
425/// DOI branch — Crossref first, with Unpaywall as a fallback when
426/// Crossref fails. Crossref's `message.link[]` array (when present)
427/// supplies the OA URL hint without making a publisher request.
428async fn metadata_only_doi(
429 _doi: &Doi,
430 ref_: &Ref,
431 profile: &CapabilityProfile,
432 ctx: &FetchContext,
433) -> Result<MetadataOnlyOutcome, FetchError> {
434 let contact = contact_email_from_env();
435 let crossref = crossref_source_from_env(&contact);
436 match crossref.fetch(ref_, profile, ctx).await {
437 Ok(res) => {
438 let metadata = res.metadata_json.unwrap_or(Value::Null);
439 let oa_url = extract_crossref_oa_url(&metadata);
440 // Pure resolver — no store write here (see `metadata_only`
441 // doc); persistence is `metadata_only_to_store`'s job.
442 Ok(MetadataOnlyOutcome {
443 source: crossref.name().to_string(),
444 resolver_profile: crossref.name().to_string(),
445 // Crossref does not surface a license directly; the
446 // license channel for DOI metadata is Unpaywall's
447 // `best_oa_location.license`. Leave `None` here; the
448 // agent can call `unpaywall` (or a follow-up slice's
449 // chained orchestrator) if it needs a license string.
450 license: None,
451 oa_url,
452 metadata,
453 })
454 }
455 Err(crossref_err) => {
456 // Crossref failed. Try Unpaywall as a fallback before
457 // surfacing the original error.
458 let unpaywall = unpaywall_source_from_env(&contact);
459 match unpaywall.fetch(ref_, profile, ctx).await {
460 Ok(res) => {
461 let metadata = res.metadata_json.unwrap_or(Value::Null);
462 let oa_url = extract_unpaywall_oa_url(&metadata);
463 let license = if res.license == "unknown" {
464 None
465 } else {
466 Some(res.license)
467 };
468 Ok(MetadataOnlyOutcome {
469 source: unpaywall.name().to_string(),
470 resolver_profile: unpaywall.name().to_string(),
471 license,
472 oa_url,
473 metadata,
474 })
475 }
476 Err(_unpaywall_err) => {
477 // Both sources failed; surface the Crossref error
478 // (the primary path) for diagnosability.
479 Err(crossref_err)
480 }
481 }
482 }
483 }
484}
485
486/// Defensively pull a Crossref OA URL out of a `message.link[]` entry.
487///
488/// The Crossref `Link` model documents `link[].URL` as the OA URL string
489/// when the work has one (see
490/// `<https://api.crossref.org/swagger-ui/index.html>`). Multiple entries
491/// may be present; we return the first non-empty `URL` field
492/// encountered. Returns `None` if the array is missing, empty, or
493/// contains no usable URL string.
494fn extract_crossref_oa_url(msg: &Value) -> Option<String> {
495 let arr = msg.get("link")?.as_array()?;
496 arr.iter()
497 .filter_map(|entry| entry.get("URL").and_then(Value::as_str))
498 .find(|s| !s.is_empty())
499 .map(|s| s.to_string())
500}
501
502/// Defensively pull Unpaywall's preferred OA URL
503/// (`best_oa_location.url_for_pdf`, falling back to `.url`) out of a
504/// metadata payload.
505fn extract_unpaywall_oa_url(meta: &Value) -> Option<String> {
506 let loc = meta.get("best_oa_location")?;
507 loc.get("url_for_pdf")
508 .and_then(Value::as_str)
509 .or_else(|| loc.get("url").and_then(Value::as_str))
510 .map(|s| s.to_string())
511}
512
513// ---------------------------------------------------------------------------
514// fetch_paper — single-ref orchestrator (Slice 2)
515// ---------------------------------------------------------------------------
516
/// Outcome of the DOI OA-PDF leg, carried on [`FetchPaperOutcome`] so a
/// caller can NEVER silently report a blocked PDF as a plain
/// "metadata-only" success (issue #118). The product promise is
/// "immediately explain WHY a paper can't be fetched" — the distinction
/// between "there was no OA PDF to fetch" and "an OA PDF existed but we
/// were blocked, and here is the reason" is exactly that explanation.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum PdfLegStatus {
    /// A PDF was fetched and written to disk (arXiv always; DOI when
    /// the OA-publisher leg succeeded).
    Fetched,
    /// No OA URL was discovered (Unpaywall reported no
    /// `best_oa_location`). Metadata-only is the correct, expected
    /// result here — not a failure.
    NoOaUrl,
    /// An OA URL *was* discovered but the PDF could not be retrieved
    /// (host outside the oa-publisher allowlist, not-a-PDF body,
    /// transport failure, …). Metadata was still written, but the
    /// caller MUST surface this reason rather than pretending the
    /// fetch was a clean metadata-only success.
    Blocked {
        /// Closed-set code, mapped from the underlying transport error
        /// via the canonical `From<FetchError> for ErrorCode`.
        code: crate::ErrorCode,
        /// Human-readable one-line reason (the `FetchError` display).
        message: String,
        /// Structured denial side-channel (ADR-0023) when the failure
        /// was an allowlist / scheme denial; `None` otherwise.
        denial: Option<crate::DenialContext>,
        /// Actionable suggested arXiv ID for the same paper when Unpaywall
        /// metadata includes an arXiv alternative but the PDF leg was blocked.
        suggested_arxiv_id: Option<String>,
    },
}

/// Outcome of a successful [`fetch_paper`] call: what was written to
/// disk and how.
///
/// Wire shape mirrors `docs/MCP_TOOLS.md` §5 `FetchResult` minus the
/// envelope chrome the MCP server wraps it in (`ok: true`, `ref`,
/// optional `error`).
///
/// `path` is the absolute path of the resource the orchestrator wrote to
/// the store. For arXiv refs and successful DOI OA-PDF fetches this is
/// `<root>/<safekey>.pdf`; for the DOI metadata-only fallback (OA URL
/// host off the `oa-publisher` allowlist, or PDF leg failed for another
/// transport reason — `docs/REDIRECT_ALLOWLIST.md` §3 informed-best-
/// effort posture) this is `<root>/.metadata/<safekey>.toml`.
/// [`Self::pdf_leg`] disambiguates *why* there is no PDF (genuinely
/// none available vs. available-but-blocked) so callers never report a
/// blocked PDF as a silent success (issue #118).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct FetchPaperOutcome {
    /// `Source::name()` of the resolver whose payload landed on disk:
    /// `"arxiv"` for an arXiv ref, `"oa-publisher"` when the DOI OA PDF
    /// leg succeeded, or `"crossref"` / `"unpaywall"` when the DOI path
    /// fell back to metadata-only. Mirrors the value written to
    /// `[doiget].source` in the metadata TOML.
    pub source: String,
    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
    /// was minted for the final artifact. For an arXiv fetch this is
    /// `"arxiv"`; for a successful DOI OA PDF leg this is
    /// `"oa-publisher"`; for the DOI metadata-only fallback this is the
    /// metadata source key (`"crossref"` / `"unpaywall"`). Equal to
    /// [`Self::source`] verbatim in Slice 4 but kept distinct so future
    /// slices can decouple "which resolver wrote to disk" from "which
    /// resolver is the audit identity". Surfaced through the
    /// `doiget_fetch_paper` MCP envelope per ADR-0021 §4.
    pub resolver_profile: String,
    /// OA license string (`"CC-BY-4.0"`, `"cc-by"`, `"arxiv-default"`,
    /// `"unknown"`). Mirrors `[doiget].license`.
    pub license: String,
    /// Absolute path of the artifact actually written
    /// (`<root>/<safekey>.pdf` on success, `<root>/.metadata/<safekey>.toml`
    /// on metadata-only fallback).
    pub path: Utf8PathBuf,
    /// Stored PDF size in bytes; `0` on the metadata-only fallback
    /// (`docs/REDIRECT_ALLOWLIST.md` §3.5).
    pub size_bytes: u64,
    /// The schema version of the metadata TOML written
    /// (always [`crate::SCHEMA_VERSION`] for this build).
    pub schema_version: String,
    /// What happened on the PDF leg (issue #118). `Fetched` /
    /// `NoOaUrl` are clean outcomes; `Blocked` carries the structured
    /// reason an OA PDF existed but could not be retrieved, so the
    /// CLI / MCP surface it instead of a silent metadata-only success.
    pub pdf_leg: PdfLegStatus,
    /// Per-ref [`crate::Safekey`] stringified (`Ref::safekey().as_str()`).
    /// Exposed on the outcome so JSON-mode CLI / MCP callers can
    /// emit a structured success body without re-parsing the input
    /// ref (#210 / `docs/ERRORS.md` §3). Always populated.
    pub safekey: String,
    /// ADR-0021 §1 canonical-digest as 64-char lowercase hex for the
    /// resolver_profile that produced this outcome's audit identity.
    /// For an arXiv fetch this is the digest under `"arxiv"`; for a
    /// DOI OA PDF leg this is under `"oa-publisher"`; for the DOI
    /// metadata-only fallback this is under the metadata source key
    /// (`"crossref"` / `"unpaywall"`). Always populated.
    pub canonical_digest: String,
}
623
624impl FetchPaperOutcome {
625 /// Test-only constructor for downstream crates (`doiget-cli`,
626 /// `doiget-mcp`) that need to drive classification / rendering
627 /// logic without running the full orchestrator. Produces a
628 /// minimal but structurally-valid outcome — all required fields
629 /// populated with defensible stubs — so unit tests can assert
630 /// the surrounding behavior (JSONL shape, exit-code mapping,
631 /// PDF-leg branching) in isolation.
632 ///
633 /// `#[doc(hidden)]` because this is not a stable public API; the
634 /// signature may change to fit test needs without a CHANGELOG
635 /// `[BREAKING]` callout.
636 #[doc(hidden)]
637 pub fn for_test_synthetic(
638 safekey: impl Into<String>,
639 source: impl Into<String>,
640 pdf_leg: PdfLegStatus,
641 ) -> Self {
642 let safekey: String = safekey.into();
643 let source: String = source.into();
644 Self {
645 source: source.clone(),
646 resolver_profile: source.clone(),
647 license: "unknown".to_string(),
648 path: Utf8PathBuf::from(format!("/tmp/{safekey}.pdf")),
649 size_bytes: 0,
650 schema_version: SCHEMA_VERSION.to_string(),
651 pdf_leg,
652 safekey: safekey.clone(),
653 // 32 bytes of `0x00` → a stable, non-secret digest stub
654 // that's still 64 chars of lowercase hex.
655 canonical_digest: "00".repeat(32),
656 }
657 }
658}
659
660/// Resolve a [`Ref`] to a PDF (or metadata-only fallback) and write it
661/// through `store`.
662///
663/// Binding spec: `docs/MCP_TOOLS.md` §4 (`doiget_fetch_paper`),
664/// `docs/REDIRECT_ALLOWLIST.md` §3 (informed-best-effort posture for the
665/// DOI OA PDF leg), `docs/PROVENANCE_LOG.md` §3 (per-attempt `Fetch` rows
666/// emitted by the source impls; `StoreWrite` row emitted by this
667/// orchestrator).
668///
669/// # Dispatch
670///
671/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch`]; the source returns PDF
672/// bytes + Atom-feed metadata. The orchestrator writes both the PDF
673/// and the metadata TOML.
674/// - `Ref::Doi(_)` → Crossref metadata + Unpaywall license/OA-URL
675/// enrichment + (when the OA URL host is on the `oa-publisher`
676/// allowlist) a publisher PDF leg. A failure on the PDF leg is
677/// non-fatal: the metadata is still written and the orchestrator
678/// returns `Ok(...)` with `source` set to the metadata source.
679///
680/// # Side effects
681///
682/// Each consulted source emits one `LogEvent::Fetch` row via
683/// `ctx.log.append`. The orchestrator additionally emits one
684/// `LogEvent::StoreWrite` row on the successful write. Session bookend
685/// rows are the caller's responsibility (the CLI's
686/// `commands::fetch::run_with_options` wraps the call; the MCP server's
687/// `doiget_fetch_paper` tool method wraps it too).
688///
689/// # Errors
690///
691/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
692/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
693/// via the existing `From<FetchError> for ErrorCode` impl.
694pub async fn fetch_paper(
695 ref_: &Ref,
696 profile: &CapabilityProfile,
697 ctx: &FetchContext,
698 store: &dyn Store,
699 store_root: &Utf8Path,
700) -> Result<FetchPaperOutcome, FetchError> {
701 let safekey = ref_.safekey();
702 match ref_ {
703 Ref::Arxiv(id) => {
704 fetch_paper_arxiv(id, ref_, profile, ctx, store, store_root, &safekey).await
705 }
706 Ref::Doi(doi) => {
707 fetch_paper_doi(doi, ref_, profile, ctx, store, store_root, &safekey).await
708 }
709 }
710}
711
/// Build the dry-run preview ([`FetchPlan`]) for a single ref without
/// touching the network, store, or provenance log.
///
/// Thin wrapper that forwards both arguments unchanged to
/// [`crate::dry_run::build_fetch_plan`]. It exists under the slice-2
/// naming the MCP tool surfaces use, so the MCP `doiget_fetch_paper`
/// tool method does not have to reach across two modules.
pub fn fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
    build_fetch_plan(ref_, store_root)
}
720
/// Fallible sibling of [`fetch_paper_plan`] — propagates an internal
/// allowlist-contract drift as a typed [`FetchError::SourceSchema`]
/// instead of degrading to an empty `candidate_hosts` list (issue
/// #156 ②).
///
/// Thin wrapper that forwards both arguments unchanged to
/// [`crate::dry_run::try_build_fetch_plan`]. It was added alongside the
/// infallible [`fetch_paper_plan`] rather than by changing that
/// function's signature, because `fetch_paper_plan` is `pub` and is
/// called from `doiget-mcp`, which was out of scope for the change that
/// introduced this function.
///
/// # Errors
///
/// See [`crate::dry_run::try_build_fetch_plan`].
pub fn try_fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
    try_build_fetch_plan(ref_, store_root)
}
735
736/// arXiv branch of [`fetch_paper`]. Internal — public callers go
737/// through `fetch_paper`.
738async fn fetch_paper_arxiv(
739 id: &ArxivId,
740 ref_: &Ref,
741 profile: &CapabilityProfile,
742 ctx: &FetchContext,
743 store: &dyn Store,
744 store_root: &Utf8Path,
745 safekey: &Safekey,
746) -> Result<FetchPaperOutcome, FetchError> {
747 let source = arxiv_source_from_env();
748 if !source.can_serve(profile, ref_) {
749 return Err(FetchError::NotEligible {
750 source_key: source.name().to_string(),
751 });
752 }
753
754 let FetchResult {
755 license,
756 pdf_bytes,
757 final_url,
758 ..
759 } = source.fetch(ref_, profile, ctx).await?;
760 let pdf = pdf_bytes.ok_or_else(|| FetchError::SourceSchema {
761 hint: "arxiv source returned no PDF bytes".to_string(),
762 })?;
763 let size_bytes = pdf.len() as u64;
764
765 // Phase 1 minimal metadata. Full Atom-feed extraction (title /
766 // authors) lives in `ArxivSource::fetch_metadata_only` and the
767 // metadata-only orchestrator; the fetch path keeps the placeholder
768 // for now (a follow-up slice may chain in Atom-parse here).
769 let metadata = Metadata {
770 schema_version: SCHEMA_VERSION.to_string(),
771 title: format!("arxiv:{}", id.as_str()),
772 authors: Vec::new(),
773 year: None,
774 doi: None,
775 arxiv_id: Some(id.clone()),
776 abstract_: None,
777 venue: None,
778 publisher: None,
779 issn: None,
780 isbn: None,
781 type_: None,
782 keywords: Vec::new(),
783 url: final_url.as_ref().map(|u| u.to_string()),
784 pdf_path: Some(format!("{}.pdf", safekey.as_str())),
785 doiget: Some(DoigetExtension {
786 fetched_at: Utc::now(),
787 source: "arxiv".to_string(),
788 license: license.clone(),
789 size_bytes,
790 mcp_call_id: None,
791 }),
792 other: BTreeMap::new(),
793 };
794
795 let tmp = stage_pdf_to_tempfile(&pdf)?;
796 let pdf_src = Utf8Path::from_path(tmp.path())
797 .ok_or_else(|| FetchError::SourceSchema {
798 hint: "staging tempfile path is not UTF-8".to_string(),
799 })?
800 .to_path_buf();
801 write_metadata_and_pdf(store, safekey, &metadata, Some(&pdf_src), ctx)?;
802 drop(tmp);
803
804 let path = store_root.join(format!("{}.pdf", safekey.as_str()));
805 let canonical_digest =
806 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), "arxiv", None).digest_hex();
807 Ok(FetchPaperOutcome {
808 source: "arxiv".to_string(),
809 resolver_profile: "arxiv".to_string(),
810 license,
811 path,
812 size_bytes,
813 schema_version: SCHEMA_VERSION.to_string(),
814 // arXiv always delivers the PDF (or the whole fn already
815 // returned Err above) — there is no metadata-only fallback.
816 pdf_leg: PdfLegStatus::Fetched,
817 safekey: safekey.as_str().to_string(),
818 canonical_digest,
819 })
820}
821
822/// DOI branch of [`fetch_paper`] — Crossref + Unpaywall + (when allowed)
823/// OA-publisher PDF leg. Mirrors the CLI's `fetch_doi` implementation
824/// (`crates/doiget-cli/src/commands/fetch.rs`) — the CLI now delegates
825/// here so both surfaces share one source of truth.
826async fn fetch_paper_doi(
827 doi: &Doi,
828 ref_: &Ref,
829 profile: &CapabilityProfile,
830 ctx: &FetchContext,
831 store: &dyn Store,
832 store_root: &Utf8Path,
833 safekey: &Safekey,
834) -> Result<FetchPaperOutcome, FetchError> {
835 let contact = contact_email_from_env();
836 let unpaywall_contact = unpaywall_email_from_env(&contact);
837 let crossref = crossref_source_from_env(&contact);
838 // Issue #120: Crossref is NON-fatal. A transient Crossref failure
839 // must not abort the whole DOI fetch when Unpaywall alone can
840 // still deliver the OA PDF. We keep the error and only surface it
841 // if nothing usable comes back (see the both-failed guard below).
842 let (cross, crossref_err) = match crossref.fetch(ref_, profile, ctx).await {
843 Ok(r) => (Some(r), None),
844 Err(e) => {
845 tracing::warn!(
846 error = %e,
847 "crossref fetch failed; continuing with unpaywall-only metadata + OA leg"
848 );
849 (None, Some(e))
850 }
851 };
852 let crossref_meta = cross
853 .as_ref()
854 .and_then(|c| c.metadata_json.clone())
855 .unwrap_or(Value::Null);
856 let extracted = extract_crossref_fields(&crossref_meta);
857
858 // Unpaywall second — license enrichment + OA URL chain discovery.
859 // A failure here is non-fatal: we still write the Crossref-
860 // derived metadata.
861 let unpaywall = unpaywall_source_from_env(&unpaywall_contact);
862 let upw_result = unpaywall.fetch(ref_, profile, ctx).await;
863 let (license, source_label, oa_chain) = match upw_result {
864 Ok(r) => {
865 let chain = extract_oa_url_chain(r.metadata_json.as_ref());
866 let label = if r.license != "unknown" {
867 "unpaywall".to_string()
868 } else {
869 "crossref".to_string()
870 };
871 (r.license, label, chain)
872 }
873 Err(e) => {
874 // Unpaywall unreachable / errored. We continue with the
875 // Crossref-only metadata, but the resulting empty OA
876 // chain will be reported downstream as
877 // `PdfLegStatus::NoOaUrl` — semantically distinct from
878 // "Unpaywall confirmed no OA URL". The provenance log
879 // already carries an Unpaywall Fetch err row (the
880 // Unpaywall source impl logged its own attempt before
881 // returning), so the audit trail captures the cause; the
882 // tracing line below makes the orchestrator-level signal
883 // loud as well. Surfacing the distinction at the
884 // `PdfLegStatus` level (a new variant like
885 // `MetadataSourceUnavailable`) is a deliberate
886 // follow-up — see CHANGELOG `[0.4.0]` Notes.
887 tracing::warn!(
888 error = %e,
889 doi = %doi.as_str(),
890 "unpaywall fetch failed; OA chain will be empty (downstream PdfLegStatus::NoOaUrl \
891 is conservative — Unpaywall was unreachable, not authoritatively oa-free)"
892 );
893 ("unknown".to_string(), "crossref".to_string(), Vec::new())
894 }
895 };
896
897 // OA PDF leg — ADR-0029 fetch chain. Walk the candidate URL list
898 // in order; first successful PDF wins, all-failed surfaces as
899 // `PdfLegStatus::Blocked` with the LAST attempt's error (the most
900 // informative for the operator — typically the network /
901 // allowlist reason the chain could not be exhausted). Each
902 // `try_fetch_oa_pdf` call already emits its own per-attempt
903 // provenance row (`oa-publisher` Fetch ok / err), so the audit
904 // trail captures every external request without orchestrator-
905 // side bookkeeping.
906 //
907 // Issue #118: a failure here is NEVER silently turned into a
908 // clean metadata-only success — the structured reason is carried
909 // out on `PdfLegStatus::Blocked`.
910 let (pdf_leg, pdf_bytes) = if oa_chain.is_empty() {
911 (PdfLegStatus::NoOaUrl, None)
912 } else {
913 let mut succeeded: Option<Vec<u8>> = None;
914 let mut last_err: Option<HttpError> = None;
915 let total = oa_chain.len();
916 for (idx, candidate) in oa_chain.iter().enumerate() {
917 let attempt = idx + 1;
918 tracing::debug!(
919 attempt,
920 total,
921 url = %candidate,
922 "trying OA PDF candidate (ADR-0029 chain)"
923 );
924 match try_fetch_oa_pdf(doi, candidate, ctx).await {
925 Ok((bytes, _final_url)) => {
926 if attempt > 1 {
927 tracing::info!(
928 attempt,
929 total,
930 url = %candidate,
931 "OA PDF chain succeeded on fallback candidate (ADR-0029)"
932 );
933 }
934 succeeded = Some(bytes);
935 break;
936 }
937 Err(e) => {
938 tracing::warn!(
939 attempt,
940 total,
941 url = %candidate,
942 error = %e,
943 "OA PDF candidate failed; advancing to next (ADR-0029 chain)"
944 );
945 last_err = Some(e);
946 }
947 }
948 }
949 match (succeeded, last_err) {
950 (Some(bytes), _) => (PdfLegStatus::Fetched, Some(bytes)),
951 (None, Some(e)) => {
952 let fe = FetchError::Http(e);
953 let denial: Option<crate::DenialContext> = (&fe).into();
954 let message = fe.to_string();
955 let code: crate::ErrorCode = fe.into();
956 let suggested_arxiv_id = oa_chain.iter().find_map(extract_arxiv_id_from_url);
957 (
958 PdfLegStatus::Blocked {
959 code,
960 message,
961 denial,
962 suggested_arxiv_id,
963 },
964 None,
965 )
966 }
967 // Defensive fallback. `oa_chain` is non-empty in this
968 // branch, so structurally at least one iteration must set
969 // either `succeeded` or `last_err`. If a future refactor
970 // breaks the invariant we fail CLOSED — surface a
971 // `Blocked` outcome with a self-describing message
972 // rather than `NoOaUrl` (which would falsely tell the
973 // caller no candidate URL was ever discovered). Routes
974 // to `INTERNAL_ERROR` so the CLI's exit-code mapping
975 // signals a doiget bug, not a remote failure.
976 (None, None) => {
977 tracing::error!(
978 total = oa_chain.len(),
979 "OA PDF chain walker exhausted without recording success or error \
980 (defensive fallback — should be unreachable)"
981 );
982 (
983 PdfLegStatus::Blocked {
984 code: crate::ErrorCode::InternalError,
985 message:
986 "OA PDF chain walker exhausted without recording success or error \
987 (orchestrator bug — please report)"
988 .to_string(),
989 denial: None,
990 suggested_arxiv_id: None,
991 },
992 None,
993 )
994 }
995 }
996 };
997
998 // Issue #120: Crossref is non-fatal, but if it failed AND the OA
999 // PDF leg produced nothing, writing a DOI-only stub entry would
1000 // mask a total failure and violate the "explain why" promise.
1001 // Surface the Crossref error so the caller reports a real reason.
1002 if let Some(e) = crossref_err {
1003 if pdf_bytes.is_none() {
1004 return Err(e);
1005 }
1006 }
1007
1008 let (final_source_label, size_bytes, pdf_path_relative, pdf_staged) = match &pdf_bytes {
1009 Some(bytes) => {
1010 let staged = stage_pdf_to_tempfile(bytes)?;
1011 (
1012 "oa-publisher".to_string(),
1013 bytes.len() as u64,
1014 Some(format!("{}.pdf", safekey.as_str())),
1015 Some(staged),
1016 )
1017 }
1018 None => (source_label, 0u64, None, None),
1019 };
1020
1021 let metadata = Metadata {
1022 schema_version: SCHEMA_VERSION.to_string(),
1023 title: extracted.title.unwrap_or_else(|| doi.as_str().to_string()),
1024 authors: extracted.authors,
1025 year: extracted.year,
1026 doi: Some(doi.clone()),
1027 arxiv_id: None,
1028 abstract_: None,
1029 venue: extracted.venue,
1030 publisher: None,
1031 issn: None,
1032 isbn: None,
1033 type_: extracted.type_,
1034 keywords: Vec::new(),
1035 url: cross
1036 .as_ref()
1037 .and_then(|c| c.final_url.as_ref())
1038 .map(|u| u.to_string()),
1039 pdf_path: pdf_path_relative,
1040 doiget: Some(DoigetExtension {
1041 fetched_at: Utc::now(),
1042 source: final_source_label.clone(),
1043 license: license.clone(),
1044 size_bytes,
1045 mcp_call_id: None,
1046 }),
1047 other: BTreeMap::new(),
1048 };
1049
1050 let pdf_src_path = pdf_staged
1051 .as_ref()
1052 .and_then(|tmp| Utf8Path::from_path(tmp.path()).map(|p| p.to_path_buf()));
1053 write_metadata_and_pdf(store, safekey, &metadata, pdf_src_path.as_deref(), ctx)?;
1054 drop(pdf_staged);
1055
1056 let path = if pdf_bytes.is_some() {
1057 store_root.join(format!("{}.pdf", safekey.as_str()))
1058 } else {
1059 store_root
1060 .join(".metadata")
1061 .join(format!("{}.toml", safekey.as_str()))
1062 };
1063 let canonical_digest = crate::CanonicalRef::new(
1064 crate::SourceType::Doi,
1065 doi.as_str(),
1066 &final_source_label,
1067 None,
1068 )
1069 .digest_hex();
1070 Ok(FetchPaperOutcome {
1071 source: final_source_label.clone(),
1072 resolver_profile: final_source_label,
1073 license,
1074 path,
1075 size_bytes,
1076 schema_version: SCHEMA_VERSION.to_string(),
1077 pdf_leg,
1078 safekey: safekey.as_str().to_string(),
1079 canonical_digest,
1080 })
1081}
1082
1083/// Stage PDF bytes to a tempfile so the existing `Store::write` atomic-
1084/// rename code path applies (the store takes a path, not bytes).
1085fn stage_pdf_to_tempfile(bytes: &[u8]) -> Result<tempfile::NamedTempFile, FetchError> {
1086 let tmp = tempfile::NamedTempFile::new().map_err(|e| FetchError::SourceSchema {
1087 hint: format!("creating PDF staging tempfile: {e}"),
1088 })?;
1089 std::fs::write(tmp.path(), bytes).map_err(|e| FetchError::SourceSchema {
1090 hint: format!("staging PDF bytes: {e}"),
1091 })?;
1092 Ok(tmp)
1093}
1094
1095/// Persist `metadata` (and optionally a PDF at `pdf_src`) through the
1096/// trait-object [`Store`] and emit a `StoreWrite` provenance row.
1097fn write_metadata_and_pdf(
1098 store: &dyn Store,
1099 safekey: &Safekey,
1100 metadata: &Metadata,
1101 pdf_src: Option<&Utf8Path>,
1102 ctx: &FetchContext,
1103) -> Result<(), FetchError> {
1104 let store_path_relative = if pdf_src.is_some() {
1105 format!("{}.pdf", safekey.as_str())
1106 } else {
1107 format!(".metadata/{}.toml", safekey.as_str())
1108 };
1109 let size_bytes = metadata.doiget.as_ref().map(|d| d.size_bytes).unwrap_or(0);
1110 let license = metadata.doiget.as_ref().map(|d| d.license.as_str());
1111 let source_name = metadata.doiget.as_ref().map(|d| d.source.as_str());
1112
1113 // ADR-0021 §1 canonical-digest for the StoreWrite row. The store
1114 // entry is keyed on the ref + the resolver that produced its
1115 // metadata (already captured in `metadata.doiget.source`). Build a
1116 // CanonicalRef from whichever id slot is populated.
1117 let canonical_digest: Option<String> = match (metadata.doi.as_ref(), metadata.arxiv_id.as_ref())
1118 {
1119 (Some(d), _) => source_name.map(|s| {
1120 crate::CanonicalRef::new(crate::SourceType::Doi, d.as_str(), s, None).digest_hex()
1121 }),
1122 (None, Some(a)) => source_name.map(|s| {
1123 crate::CanonicalRef::new(crate::SourceType::Arxiv, a.as_str(), s, None).digest_hex()
1124 }),
1125 (None, None) => None,
1126 };
1127
1128 match store.write(safekey, metadata, pdf_src) {
1129 Ok(()) => {
1130 ctx.log.append(RowInput {
1131 event: LogEvent::StoreWrite,
1132 result: LogResult::Ok,
1133 capability: Capability::Oa,
1134 ref_: metadata
1135 .doi
1136 .as_ref()
1137 .map(|d| d.as_str())
1138 .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1139 source: source_name,
1140 error_code: None,
1141 size_bytes: Some(size_bytes),
1142 license,
1143 store_path: Some(&store_path_relative),
1144 canonical_digest: canonical_digest.as_deref(),
1145 })?;
1146 Ok(())
1147 }
1148 Err(e) => {
1149 // Best-effort: record the StoreWrite failure before
1150 // propagating the store.write error. We do NOT
1151 // propagate the log-append error itself here — we're
1152 // already in an error state from the store, and the
1153 // primary failure is what the caller needs to act on.
1154 // But the log-append failure is observable via tracing
1155 // so an operator can spot a broken hash chain when
1156 // both fail. Surface as `SourceSchema` so the
1157 // FetchError -> ErrorCode collapse routes it to
1158 // `INTERNAL_ERROR` (closest closed-set fit; `StoreError`
1159 // does not have a direct closed-set arm).
1160 if let Err(log_err) = ctx.log.append(RowInput {
1161 event: LogEvent::StoreWrite,
1162 result: LogResult::Err,
1163 capability: Capability::Oa,
1164 ref_: metadata
1165 .doi
1166 .as_ref()
1167 .map(|d| d.as_str())
1168 .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1169 source: source_name,
1170 error_code: Some("STORE_ERROR"),
1171 size_bytes: None,
1172 license: None,
1173 store_path: Some(&store_path_relative),
1174 canonical_digest: canonical_digest.as_deref(),
1175 }) {
1176 tracing::error!(
1177 store_err = %e,
1178 log_err = %log_err,
1179 "BOTH store.write AND provenance log append failed; \
1180 audit trail is broken for this attempt"
1181 );
1182 }
1183 Err(FetchError::SourceSchema {
1184 hint: format!("store write failed: {e}"),
1185 })
1186 }
1187 }
1188}
1189
1190/// Attempt the OA PDF fetch under the `"oa-publisher"` source key.
1191async fn try_fetch_oa_pdf(
1192 doi: &Doi,
1193 url: &url::Url,
1194 ctx: &FetchContext,
1195) -> Result<(Vec<u8>, url::Url), HttpError> {
1196 const SOURCE: &str = "oa-publisher";
1197 let _permit = ctx.rate_limiter.acquire(SOURCE).await;
1198 // ADR-0021 §1: the oa-publisher PDF leg is a DISTINCT audit
1199 // identity from the Crossref/Unpaywall metadata legs even though
1200 // the ref is the same DOI — that's the whole point of carrying
1201 // `resolver_profile` into the digest. Compute once and re-use for
1202 // both the ok and err row variants below.
1203 let canonical =
1204 crate::CanonicalRef::new(crate::SourceType::Doi, doi.as_str(), SOURCE, None).digest_hex();
1205
1206 // Pre-fetch host allowlist check on the metadata-discovered OA URL
1207 // (issue #145; `docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE). The
1208 // per-source `redirect_hosts` allowlist is, by §1, consulted "on the
1209 // OA URL discovered through metadata sources before the actual PDF
1210 // fetch is issued", not only on redirect hops. The redirect closure in
1211 // `crate::http` only fires when an *actual redirect* occurs; an OA URL
1212 // whose host is off the `oa-publisher` allowlist that resolves WITHOUT
1213 // a redirect would otherwise reach connect and be misclassified as a
1214 // transport error, violating §1. This is scoped strictly to the
1215 // `"oa-publisher"` PDF leg — §6 explicitly exempts the initial
1216 // template-constructed URL, and `fetch_bytes`/metadata-only/resolve-
1217 // only paths (which never follow the OA URL) are deliberately NOT
1218 // touched. On a host MISS we return the *same* `HttpError::RedirectDenied`
1219 // value the redirect closure produces (same `source_key`, lowercased
1220 // `host`, and `expected_hosts` snapshot), reusing the identical
1221 // allowlist the closure captured (queried via `source_allowlist`, not
1222 // re-derived) so the single source of truth cannot drift. Returning
1223 // that exact variant means the existing `Err(e)` arm below, the
1224 // `From<&HttpError> for Option<DenialContext>` mapping
1225 // (`DenialReason::RedirectNotInAllowlist`), the `PdfLegStatus::Blocked`
1226 // construction in the caller, and PR #162's CLI classification all see
1227 // a byte-identical downstream shape with no new code path.
1228 if let Some(allowlist) = ctx.http.source_allowlist(SOURCE) {
1229 // `Url::host_str()` is `None` for hostless URLs (e.g. `data:`);
1230 // treat that exactly as the redirect closure does (an allowlist
1231 // miss with an empty host string).
1232 let host = url
1233 .host_str()
1234 .map(|h| h.to_ascii_lowercase())
1235 .unwrap_or_default();
1236 if !allowlist.matches(&host) {
1237 let e = HttpError::RedirectDenied {
1238 source_key: SOURCE.to_string(),
1239 host: host.clone(),
1240 expected_hosts: allowlist.redirect_hosts.clone(),
1241 };
1242 tracing::info!(
1243 oa_url = %url,
1244 denied_host = %host,
1245 "OA URL host outside oa-publisher allowlist (pre-fetch check, \
1246 docs/REDIRECT_ALLOWLIST.md §1 / issue #145)"
1247 );
1248 // Emit the SAME provenance row the post-fetch redirect-denied
1249 // path emits: a `Fetch` `Err` row under the `oa-publisher`
1250 // source key with the closed-set `NETWORK_ERROR` code and the
1251 // same canonical digest. Mirrors the `Err(e)` arm below so the
1252 // audit trail is indistinguishable from a redirect-time denial.
1253 let _ = ctx.log.append(RowInput {
1254 event: LogEvent::Fetch,
1255 result: LogResult::Err,
1256 capability: Capability::Oa,
1257 ref_: Some(doi.as_str()),
1258 source: Some(SOURCE),
1259 error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1260 size_bytes: None,
1261 license: None,
1262 store_path: None,
1263 canonical_digest: Some(&canonical),
1264 });
1265 return Err(e);
1266 }
1267 }
1268
1269 match ctx.http.fetch_pdf(SOURCE, url.clone()).await {
1270 Ok((body, final_url)) => {
1271 let size_bytes = body.len() as u64;
1272 if let Err(e) = ctx.log.append(RowInput {
1273 event: LogEvent::Fetch,
1274 result: LogResult::Ok,
1275 capability: Capability::Oa,
1276 ref_: Some(doi.as_str()),
1277 source: Some(SOURCE),
1278 error_code: None,
1279 size_bytes: Some(size_bytes),
1280 license: None,
1281 store_path: None,
1282 canonical_digest: Some(&canonical),
1283 }) {
1284 tracing::warn!(error = %e, "appending oa-publisher Fetch ok row failed");
1285 }
1286 Ok((body.to_vec(), final_url))
1287 }
1288 Err(e) => {
1289 match &e {
1290 HttpError::RedirectDenied { host, .. } => {
1291 tracing::info!(
1292 oa_url = %url,
1293 denied_host = %host,
1294 "OA URL host outside oa-publisher allowlist"
1295 );
1296 }
1297 HttpError::NotAPdf { .. } => {
1298 tracing::info!(
1299 oa_url = %url,
1300 "OA URL did not return a PDF magic byte"
1301 );
1302 }
1303 other => {
1304 tracing::warn!(
1305 oa_url = %url,
1306 error = %other,
1307 "OA PDF fetch failed"
1308 );
1309 }
1310 }
1311 // Provenance `error_code` is the CLOSED-set code. Every
1312 // `HttpError` collapses to `NETWORK_ERROR` through the
1313 // canonical `From<FetchError> for ErrorCode` (the closed
1314 // set has no finer transport code by design) — so this is
1315 // the correct mapped value, not the misattribution the
1316 // previous hardcode implied. The *fine* reason
1317 // (RedirectDenied vs NotAPdf vs …) is preserved for the
1318 // user via `PdfLegStatus::Blocked.denial` / `.message`
1319 // built by the caller from the returned `HttpError`
1320 // (issue #118). Rendered via `ErrorCode::as_wire` so the
1321 // token can never drift from the enum.
1322 let _ = ctx.log.append(RowInput {
1323 event: LogEvent::Fetch,
1324 result: LogResult::Err,
1325 capability: Capability::Oa,
1326 ref_: Some(doi.as_str()),
1327 source: Some(SOURCE),
1328 error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1329 size_bytes: None,
1330 license: None,
1331 store_path: None,
1332 canonical_digest: Some(&canonical),
1333 });
1334 Err(e)
1335 }
1336 }
1337}
1338
/// Subset of Crossref `message` fields populated into the on-disk metadata.
///
/// Produced by [`extract_crossref_fields`]; every field is absent/empty
/// when the corresponding Crossref key is missing or has an unexpected
/// shape.
pub(crate) struct CrossrefFields {
    /// First string element of the `title` array.
    pub(crate) title: Option<String>,
    /// One entry per `author` element that carries a `family` and/or
    /// `given` string, rendered as `"family, given"` when both exist.
    pub(crate) authors: Vec<String>,
    /// First number of the first `issued.date-parts` entry, when it
    /// fits in an `i32`.
    pub(crate) year: Option<i32>,
    /// First string element of the `container-title` array.
    pub(crate) venue: Option<String>,
    /// The `type` string.
    pub(crate) type_: Option<String>,
}
1347
1348/// Defensively pull bibliographic fields out of a Crossref envelope's
1349/// message object. Every field is optional; malformed shapes degrade
1350/// to None rather than panicking.
1351pub(crate) fn extract_crossref_fields(msg: &Value) -> CrossrefFields {
1352 let title = msg
1353 .get("title")
1354 .and_then(|v| v.as_array())
1355 .and_then(|arr| arr.first())
1356 .and_then(|v| v.as_str())
1357 .map(|s| s.to_string());
1358
1359 let authors = msg
1360 .get("author")
1361 .and_then(|v| v.as_array())
1362 .map(|arr| {
1363 arr.iter()
1364 .filter_map(|a| {
1365 let family = a.get("family").and_then(|v| v.as_str());
1366 let given = a.get("given").and_then(|v| v.as_str());
1367 match (family, given) {
1368 (Some(f), Some(g)) => Some(format!("{f}, {g}")),
1369 (Some(f), None) => Some(f.to_string()),
1370 (None, Some(g)) => Some(g.to_string()),
1371 _ => None,
1372 }
1373 })
1374 .collect()
1375 })
1376 .unwrap_or_default();
1377
1378 let year = msg
1379 .get("issued")
1380 .and_then(|v| v.get("date-parts"))
1381 .and_then(|v| v.as_array())
1382 .and_then(|arr| arr.first())
1383 .and_then(|v| v.as_array())
1384 .and_then(|arr| arr.first())
1385 .and_then(|v| v.as_i64())
1386 .and_then(|n| i32::try_from(n).ok());
1387
1388 let venue = msg
1389 .get("container-title")
1390 .and_then(|v| v.as_array())
1391 .and_then(|arr| arr.first())
1392 .and_then(|v| v.as_str())
1393 .map(|s| s.to_string());
1394
1395 let type_ = msg
1396 .get("type")
1397 .and_then(|v| v.as_str())
1398 .map(|s| s.to_string());
1399
1400 CrossrefFields {
1401 title,
1402 authors,
1403 year,
1404 venue,
1405 type_,
1406 }
1407}
1408
1409/// Pull the ordered chain of candidate OA URLs out of an Unpaywall
1410/// `metadata_json` envelope per ADR-0029 D2.
1411///
1412/// Order is `best_oa_location` first (when present), then every
1413/// distinct entry in `oa_locations[]`. Duplicate URLs are deduped by
1414/// exact string match so a candidate that appears as both the "best"
1415/// entry and an array element is fetched at most once.
1416///
1417/// Each location's URL is resolved via the same `url_for_pdf` →
1418/// `url` fallback the single-URL extractor uses.
1419///
1420/// Returns `Vec::new()` when no OA location was reported (the chain
1421/// is empty and the caller surfaces [`PdfLegStatus::NoOaUrl`]).
1422fn extract_oa_url_chain(meta: Option<&Value>) -> Vec<url::Url> {
1423 let meta = match meta {
1424 Some(m) => m,
1425 None => return Vec::new(),
1426 };
1427 let mut out: Vec<url::Url> = Vec::new();
1428 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
1429 let mut push_unique = |u: url::Url| {
1430 let key = u.as_str().to_string();
1431 if seen.insert(key) {
1432 out.push(u);
1433 }
1434 };
1435
1436 // Priority 1: best_oa_location (Unpaywall's own quality-ordered
1437 // pick — ADR-0029 D2 NORMATIVE: defer to the metadata source's
1438 // ordering).
1439 if let Some(best) = meta.get("best_oa_location") {
1440 if let Some(u) = pull_oa_url_from_location(best) {
1441 push_unique(u);
1442 }
1443 }
1444 // Priority 2: every entry in oa_locations[] after the best one.
1445 // The fallback target this ADR exists to enable is precisely the
1446 // arXiv preprint that lives here when `best_oa_location` is a
1447 // WAF-blocked publisher URL.
1448 if let Some(arr) = meta.get("oa_locations").and_then(|v| v.as_array()) {
1449 for loc in arr {
1450 if let Some(u) = pull_oa_url_from_location(loc) {
1451 push_unique(u);
1452 }
1453 }
1454 }
1455 out
1456}
1457
1458/// Resolve a single OA location object to a `url::Url`. Tries
1459/// `url_for_pdf` first (the direct PDF link Unpaywall annotates when
1460/// it knows one), falling back to `url` (the landing page). Returns
1461/// `None` if neither field is present or parses.
1462fn pull_oa_url_from_location(loc: &Value) -> Option<url::Url> {
1463 let candidate = loc
1464 .get("url_for_pdf")
1465 .and_then(|v| v.as_str())
1466 .or_else(|| loc.get("url").and_then(|v| v.as_str()))?;
1467 url::Url::parse(candidate).ok()
1468}
1469
1470/// Helper to parse clean arXiv IDs from URLs like arxiv.org/pdf/1901.12345.pdf.
1471///
1472/// Strips the trailing `.pdf` extension and any version suffix (`v1`, `v2`, …)
1473/// so the returned ID refers to the latest version rather than pinning a
1474/// specific one. Returns `None` for non-arXiv hosts or unrecognised path shapes.
1475fn extract_arxiv_id_from_url(url: &url::Url) -> Option<String> {
1476 let host = url.host_str()?;
1477 let is_arxiv = matches!(
1478 host,
1479 "arxiv.org" | "www.arxiv.org" | "export.arxiv.org" | "e-print.arxiv.org"
1480 );
1481 if !is_arxiv {
1482 return None;
1483 }
1484 let path = url.path();
1485 let raw = if path.starts_with("/pdf/") {
1486 let s = path.strip_prefix("/pdf/")?;
1487 s.strip_suffix(".pdf").unwrap_or(s)
1488 } else if path.starts_with("/abs/") {
1489 path.strip_prefix("/abs/")?
1490 } else {
1491 return None;
1492 };
1493 Some(strip_arxiv_version(raw).to_string())
1494}
1495
/// Remove a trailing arXiv version marker (`v1`, `v2`, …) from `id`.
///
/// The marker is recognised only at the last `v` in the string, and only
/// when that `v` directly follows an ASCII digit and is followed by at
/// least one character, all of them ASCII digits. The digit-before rule
/// keeps category fragments such as `quant-ph` intact. Anything else is
/// returned unchanged.
fn strip_arxiv_version(id: &str) -> &str {
    let Some((stem, version)) = id.rsplit_once('v') else {
        return id;
    };
    let stem_ends_in_digit = stem.ends_with(|c: char| c.is_ascii_digit());
    let version_is_numeric = !version.is_empty() && version.chars().all(|c| c.is_ascii_digit());
    if stem_ends_in_digit && version_is_numeric {
        stem
    } else {
        id
    }
}
1514
/// Contact e-mail to use for Unpaywall: the value of the
/// `DOIGET_UNPAYWALL_EMAIL` environment variable when it can be read,
/// otherwise a copy of `fallback_contact`.
fn unpaywall_email_from_env(fallback_contact: &str) -> String {
    match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
        Ok(configured) => configured,
        Err(_) => fallback_contact.to_owned(),
    }
}
1518
1519// ---------------------------------------------------------------------------
1520// batch_fetch — multi-ref orchestrator (Slice 2)
1521// ---------------------------------------------------------------------------
1522
/// Per-ref outcome carried inside [`BatchOutcome::results`].
///
/// Each entry's `outcome` is independent — a single `Err(...)` does not
/// abort sibling refs. The MCP `doiget_batch_fetch` tool method
/// serializes the success-or-error per row inside `results[]`.
///
/// [`batch_fetch`] builds one entry per input ref, pairing a clone of
/// the ref with whatever [`fetch_paper`] returned for it.
#[derive(Debug)]
pub struct BatchResultEntry {
    /// The parsed ref this entry describes (a clone of the input ref).
    pub ref_: Ref,
    /// `Ok(...)` on a successful fetch through [`fetch_paper`];
    /// `Err(...)` on a per-ref failure (the outer call still returned
    /// `Ok(BatchOutcome)`).
    pub outcome: Result<FetchPaperOutcome, FetchError>,
}
1537
/// Outcome of a successful [`batch_fetch`] call.
///
/// The outer call returns `Err(_)` only on whole-call failures (the
/// only such variant in Slice 2 is [`FetchError::TooManyRefs`]). Each
/// per-ref result lives inside `results[]` so the agent can see every
/// outcome without losing sibling successes.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build
/// the struct with a literal or match it exhaustively.
#[derive(Debug)]
#[non_exhaustive]
pub struct BatchOutcome {
    /// One entry per supplied ref, in input order.
    pub results: Vec<BatchResultEntry>,
}
1550
1551/// Iterate over `refs` through [`fetch_paper`], collecting one
1552/// [`BatchResultEntry`] per ref.
1553///
1554/// **Cap**: caller must supply at most [`MAX_BATCH_REFS`] refs; otherwise
1555/// the function returns `Err(FetchError::TooManyRefs { got, max })`
1556/// before any fetch is attempted. The cap mirrors the CLI's
1557/// `commands::batch` enforcement (`MCP_BATCH_MAX_SIZE`).
1558///
1559/// **Concurrency**: Slice 2 dispatches refs serially through
1560/// [`fetch_paper`]. The CLI's existing `commands::batch::run_with_options`
1561/// keeps its bounded-concurrency `JoinSet`+semaphore path for backward
1562/// compatibility; the MCP server uses this serial loop because the MCP
1563/// tool boundary already serializes calls per session.
1564///
1565/// **Session bookkeeping**: this function does NOT emit `SessionStart`
1566/// / `SessionEnd` rows — that is the caller's responsibility.
1567pub async fn batch_fetch(
1568 refs: &[Ref],
1569 profile: &CapabilityProfile,
1570 ctx: &FetchContext,
1571 store: &dyn Store,
1572 store_root: &Utf8Path,
1573) -> Result<BatchOutcome, FetchError> {
1574 if refs.len() > MAX_BATCH_REFS {
1575 return Err(FetchError::TooManyRefs {
1576 got: refs.len(),
1577 max: MAX_BATCH_REFS,
1578 });
1579 }
1580 let mut results = Vec::with_capacity(refs.len());
1581 for ref_ in refs {
1582 let outcome = fetch_paper(ref_, profile, ctx, store, store_root).await;
1583 results.push(BatchResultEntry {
1584 ref_: ref_.clone(),
1585 outcome,
1586 });
1587 }
1588 Ok(BatchOutcome { results })
1589}
1590
1591/// Dry-run preview for a batch — one [`FetchPlan`] per ref. Enforces
1592/// the same [`MAX_BATCH_REFS`] cap [`batch_fetch`] does.
1593///
1594/// Returns `Err(FetchError::TooManyRefs)` when over the cap, or
1595/// `Err(FetchError::SourceSchema)` if the dry-run allowlist invariant
1596/// has drifted (issue #156 ②: this now propagates as a typed error via
1597/// [`try_build_fetch_plan`] rather than silently emitting an empty
1598/// `candidate_hosts` list — the signature already returned `Result`, so
1599/// this is an in-crate behavior tightening with no caller-visible type
1600/// change). Otherwise `Ok(Vec<(Ref, FetchPlan)>)` parallel to the input
1601/// order.
1602pub fn batch_fetch_plans(
1603 refs: &[Ref],
1604 store_root: &Utf8Path,
1605) -> Result<Vec<(Ref, FetchPlan)>, FetchError> {
1606 if refs.len() > MAX_BATCH_REFS {
1607 return Err(FetchError::TooManyRefs {
1608 got: refs.len(),
1609 max: MAX_BATCH_REFS,
1610 });
1611 }
1612 refs.iter()
1613 .map(|r| try_build_fetch_plan(r, store_root).map(|p| (r.clone(), p)))
1614 .collect()
1615}
1616
1617// ---------------------------------------------------------------------------
1618// Tests
1619// ---------------------------------------------------------------------------
1620
1621#[cfg(test)]
1622#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1623mod tests {
1624 use super::*;
1625
    /// Table-driven check of `extract_arxiv_id_from_url`: each row pairs
    /// an input URL with the ID expected back (`None` = no ID).
    #[test]
    fn test_extract_arxiv_id_from_url() {
        let urls = [
            // Basic new-style ID
            ("https://arxiv.org/pdf/1901.12345.pdf", Some("1901.12345")),
            ("https://arxiv.org/abs/1901.12345", Some("1901.12345")),
            // Version suffix is stripped
            ("https://arxiv.org/pdf/1901.12345v2.pdf", Some("1901.12345")),
            ("https://arxiv.org/abs/1901.12345v3", Some("1901.12345")),
            // Old-style category/ID
            (
                "https://www.arxiv.org/pdf/cond-mat/9501001.pdf",
                Some("cond-mat/9501001"),
            ),
            (
                "https://export.arxiv.org/abs/cond-mat/9501001",
                Some("cond-mat/9501001"),
            ),
            // Old-style with version stripped
            (
                "https://arxiv.org/pdf/cond-mat/9501001v1.pdf",
                Some("cond-mat/9501001"),
            ),
            // e-print subdomain
            (
                "https://e-print.arxiv.org/pdf/2401.12345.pdf",
                Some("2401.12345"),
            ),
            // Non-arXiv host
            ("https://example.org/pdf/1901.12345.pdf", None),
        ];
        for (url_str, expected) in urls {
            let url = url::Url::parse(url_str).unwrap();
            // The URL is included in the failure message so a failing
            // row can be identified.
            assert_eq!(
                extract_arxiv_id_from_url(&url),
                expected.map(String::from),
                "url: {url_str}"
            );
        }
    }
1666
#[test]
fn test_strip_arxiv_version() {
    // Each case is (input, expected output).
    let cases = [
        ("2401.12345v2", "2401.12345"),
        ("2401.12345v10", "2401.12345"),
        // No version suffix: returned as-is.
        ("2401.12345", "2401.12345"),
        ("cond-mat/9501001v3", "cond-mat/9501001"),
        // Ends in `v5`, but the `v` follows a letter rather than a
        // numeric id, so the input is expected back unchanged.
        ("quant-phv5", "quant-phv5"),
    ];

    for (input, want) in cases {
        assert_eq!(strip_arxiv_version(input), want);
    }
}
1679
#[test]
fn extract_crossref_oa_url_finds_first_url() {
    // Two non-empty links: the first one must be reported.
    let first = "https://example.org/free.pdf";
    let msg = serde_json::json!({
        "link": [
            {"URL": first},
            {"URL": "https://example.org/alt.pdf"}
        ]
    });

    let found = extract_crossref_oa_url(&msg);
    assert_eq!(found, Some(first.to_string()));
}
1693
#[test]
fn extract_crossref_oa_url_returns_none_when_absent() {
    // An empty object has no `link` array, so there is nothing to extract.
    let empty_message = serde_json::json!({});
    let found = extract_crossref_oa_url(&empty_message);
    assert!(found.is_none());
}
1699
#[test]
fn extract_crossref_oa_url_skips_empty_url_strings() {
    // An empty `URL` ahead of a real one must not be picked.
    let real = "https://example.org/real.pdf";
    let msg = serde_json::json!({
        "link": [
            {"URL": ""},
            {"URL": real}
        ]
    });

    let found = extract_crossref_oa_url(&msg);
    assert_eq!(found, Some(real.to_string()));
}
1713
#[test]
fn extract_unpaywall_oa_url_prefers_url_for_pdf() {
    // With both fields present, `url_for_pdf` wins over `url`.
    let pdf_link = "https://example.org/pdf";
    let meta = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": pdf_link,
            "url": "https://example.org/landing"
        }
    });

    let found = extract_unpaywall_oa_url(&meta);
    assert_eq!(found, Some(pdf_link.to_string()));
}
1727
#[test]
fn extract_unpaywall_oa_url_falls_back_to_url() {
    // With no `url_for_pdf`, the plain `url` is used.
    let landing = "https://example.org/landing";
    let meta = serde_json::json!({
        "best_oa_location": {
            "url": landing
        }
    });

    let found = extract_unpaywall_oa_url(&meta);
    assert_eq!(found, Some(landing.to_string()));
}
1740
#[test]
fn extract_unpaywall_oa_url_returns_none_when_absent() {
    // An empty record has no `best_oa_location`, so nothing is extracted.
    let empty_record = serde_json::json!({});
    let found = extract_unpaywall_oa_url(&empty_record);
    assert!(found.is_none());
}
1746
1747 // ---------------------------------------------------------------
1748 // Slice 2: fetch_paper / batch_fetch coverage. The wiremock-driven
1749 // happy-path tests live in `crates/doiget-mcp/tests/...` (they need
1750 // a real `Store` impl and an HTTP client wired to `FetchContext`,
1751 // both of which the MCP integration tests already stand up). The
1752 // unit tests here pin the pure-function pieces (extractors, cap
1753 // enforcement, plan-shape preservation).
1754 // ---------------------------------------------------------------
1755
#[test]
fn extract_crossref_fields_parses_minimal_shape() {
    // A minimal Crossref-style message with one value per field.
    let message = serde_json::json!({
        "title": ["Example Title"],
        "author": [{ "family": "Smith", "given": "Alice" }],
        "issued": { "date-parts": [[2024, 1, 15]] },
        "container-title": ["Phys. Rev. X"],
        "type": "journal-article"
    });

    let fields = extract_crossref_fields(&message);

    let expected_authors = vec!["Smith, Alice".to_string()];
    assert_eq!(fields.title.as_deref(), Some("Example Title"));
    assert_eq!(fields.authors, expected_authors);
    assert_eq!(fields.year, Some(2024));
    assert_eq!(fields.venue.as_deref(), Some("Phys. Rev. X"));
    assert_eq!(fields.type_.as_deref(), Some("journal-article"));
}
1772
#[test]
fn extract_crossref_fields_tolerates_missing() {
    // An empty message yields every optional field unset and no authors.
    let empty_message = serde_json::json!({});
    let fields = extract_crossref_fields(&empty_message);

    assert!(fields.title.is_none());
    assert!(fields.authors.is_empty());
    assert!(fields.year.is_none());
    assert!(fields.venue.is_none());
    assert!(fields.type_.is_none());
}
1782
#[test]
fn extract_oa_url_chain_prefers_best_url_for_pdf() {
    // `best_oa_location.url_for_pdf` is the top-priority candidate
    // (ADR-0029 D2 — defer to the metadata source's ordering);
    // `best_oa_location.url` is used only when no PDF link is annotated.
    // With both present, the chain holds just the PDF link.
    let record = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://example.org/pdf",
            "url": "https://example.org/landing"
        }
    });

    let chain = extract_oa_url_chain(Some(&record));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
    assert_eq!(urls, ["https://example.org/pdf"]);
}
1799
#[test]
fn extract_oa_url_chain_falls_back_to_url_when_url_for_pdf_absent() {
    // Only `url` is present, so it becomes the single chain entry.
    let record = serde_json::json!({
        "best_oa_location": {
            "url": "https://example.org/landing"
        }
    });

    let chain = extract_oa_url_chain(Some(&record));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
    assert_eq!(urls, ["https://example.org/landing"]);
}
1811
#[test]
fn extract_oa_url_chain_is_empty_when_no_locations() {
    // An empty record yields an empty chain.
    let empty_record = serde_json::json!({});
    let from_empty = extract_oa_url_chain(Some(&empty_record));
    assert!(from_empty.is_empty());

    // So does having no metadata at all.
    let from_missing = extract_oa_url_chain(None);
    assert!(from_missing.is_empty());
}
1818
#[test]
fn extract_oa_url_chain_appends_oa_locations_after_best() {
    // ADR-0029 D2: `best_oa_location` comes first, then the remaining
    // `oa_locations` in metadata-source order. This is the load-bearing
    // case: an arXiv preprint listed *after* a WAF-blocked publisher in
    // `oa_locations[]` must stay reachable as a fallback candidate.
    let publisher = "https://publisher.example.org/pdf";
    let preprint = "https://arxiv.org/pdf/2401.12345";
    let repository = "https://repo.example.edu/handle/123";
    let record = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": publisher
        },
        "oa_locations": [
            {"url_for_pdf": publisher},
            {"url_for_pdf": preprint},
            {"url": repository}
        ]
    });

    let chain = extract_oa_url_chain(Some(&record));
    let observed: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
    assert_eq!(
        observed,
        vec![publisher, preprint, repository],
        "chain ordering MUST be best_oa_location first, oa_locations[] verbatim after"
    );
}
1848
#[test]
fn extract_oa_url_chain_dedupes_repeated_urls() {
    // A URL that appears in `best_oa_location` and again in
    // `oa_locations[]` must appear once in the chain. Otherwise a record
    // repeating the same URL would cost extra HTTP requests and
    // rate-limit ticks.
    let repeated = "https://example.org/pdf";
    let preprint = "https://arxiv.org/pdf/2401.12345";
    let record = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": repeated
        },
        "oa_locations": [
            {"url_for_pdf": repeated},
            {"url_for_pdf": repeated},
            {"url_for_pdf": preprint}
        ]
    });

    let chain = extract_oa_url_chain(Some(&record));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
    assert_eq!(urls, [repeated, preprint]);
}
1870
#[test]
fn extract_oa_url_chain_skips_unparsable_urls() {
    // A malformed entry in `oa_locations[]` is dropped rather than
    // aborting the chain, so one stray entry from the metadata source
    // cannot poison the whole fetch.
    let good = "https://good.example.org/pdf";
    let preprint = "https://arxiv.org/pdf/2401.12345";
    let record = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": good
        },
        "oa_locations": [
            {"url_for_pdf": "not a url"},
            {"url_for_pdf": preprint}
        ]
    });

    let chain = extract_oa_url_chain(Some(&record));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
    assert_eq!(urls, [good, preprint]);
}
1890
#[test]
fn fetch_paper_plan_matches_build_fetch_plan() {
    // `fetch_paper_plan` is the slice-2 name for
    // `dry_run::build_fetch_plan`. Comparing their output for the same
    // inputs means a refactor that makes them diverge fails here.
    use crate::{ArxivId, Doi};
    let root = Utf8PathBuf::from("/tmp/doiget-test");

    // DOI ref: metadata sources and both target paths must agree.
    let doi_ref = Ref::Doi(Doi("10.1234/example".to_string()));
    let via_alias = fetch_paper_plan(&doi_ref, &root);
    let via_builder = build_fetch_plan(&doi_ref, &root);
    assert_eq!(via_alias.metadata_sources, via_builder.metadata_sources);
    assert_eq!(via_alias.target_pdf_path, via_builder.target_pdf_path);
    assert_eq!(
        via_alias.target_metadata_path,
        via_builder.target_metadata_path
    );

    // arXiv ref: the first PDF source key must agree.
    let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));
    let arxiv_via_alias = fetch_paper_plan(&arxiv_ref, &root);
    let arxiv_via_builder = build_fetch_plan(&arxiv_ref, &root);
    assert_eq!(
        arxiv_via_alias.pdf_sources[0].key,
        arxiv_via_builder.pdf_sources[0].key
    );
}
1910
#[test]
fn batch_fetch_plans_returns_plan_per_ref_in_order() {
    use crate::{ArxivId, Doi};
    let root = Utf8PathBuf::from("/tmp/doiget-batch-test");
    let refs = vec![
        Ref::Doi(Doi("10.1234/alpha".to_string())),
        Ref::Arxiv(ArxivId("2401.12345".to_string())),
    ];

    let plans = batch_fetch_plans(&refs, &root).expect("under cap returns Ok");
    assert_eq!(plans.len(), 2);

    let (first_ref, first_plan) = &plans[0];
    let (second_ref, second_plan) = &plans[1];

    // Output order mirrors input order.
    assert!(matches!(first_ref, Ref::Doi(_)));
    assert!(matches!(second_ref, Ref::Arxiv(_)));
    // The DOI plan lists the crossref + unpaywall metadata sources.
    assert_eq!(first_plan.metadata_sources, vec!["crossref", "unpaywall"]);
    // The arXiv plan's first PDF source is keyed `arxiv`.
    assert_eq!(second_plan.pdf_sources[0].key, "arxiv");
}
1929
#[test]
fn batch_fetch_plans_too_many_refs_returns_err() {
    use crate::Doi;
    // One ref over the cap: the smallest batch that must be rejected.
    let n = MAX_BATCH_REFS + 1;
    let mut refs: Vec<Ref> = Vec::with_capacity(n);
    for i in 0..n {
        refs.push(Ref::Doi(Doi(format!("10.1234/n{i}"))));
    }
    let root = Utf8PathBuf::from("/tmp/doiget-toomany");

    let err = batch_fetch_plans(&refs, &root).expect_err("over cap returns Err");
    let (got, max) = match err {
        FetchError::TooManyRefs { got, max } => (got, max),
        other => panic!("expected TooManyRefs, got: {other:?}"),
    };
    assert_eq!(got, n);
    assert_eq!(max, MAX_BATCH_REFS);
}
1948
1949 #[tokio::test]
1950 async fn batch_fetch_too_many_refs_returns_err_before_any_fetch() {
1951 // The cap is enforced before any per-ref work, so we don't need
1952 // a working store/network here — pass a sentinel store_root and
1953 // a dummy FetchContext that would panic on use.
1954 use crate::http::{tier_1_allowlist, HttpClient};
1955 use crate::provenance::ProvenanceLog;
1956 use crate::rate_limiter::RateLimiter;
1957 use crate::store::FsStore;
1958 use crate::{Doi, RateLimits};
1959 use std::sync::Arc;
1960
1961 let td = tempfile::TempDir::new().expect("tempdir");
1962 let log_path = Utf8Path::from_path(td.path())
1963 .expect("utf-8")
1964 .join("log.jsonl");
1965 let store_root = Utf8Path::from_path(td.path())
1966 .expect("utf-8")
1967 .join("papers");
1968
1969 let ctx = FetchContext {
1970 http: Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client")),
1971 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1972 log: Arc::new(
1973 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1974 .expect("provenance log"),
1975 ),
1976 session_id: "01J0000000000000000000TEST".into(),
1977 };
1978 let profile = CapabilityProfile::from_env().expect("clean env");
1979 let store = FsStore::new(store_root.clone()).expect("fs store");
1980
1981 let n = MAX_BATCH_REFS + 1;
1982 let refs: Vec<Ref> = (0..n)
1983 .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1984 .collect();
1985
1986 let err = batch_fetch(&refs, &profile, &ctx, &store, &store_root)
1987 .await
1988 .expect_err("over cap returns Err");
1989 match err {
1990 FetchError::TooManyRefs { got, max } => {
1991 assert_eq!(got, n);
1992 assert_eq!(max, MAX_BATCH_REFS);
1993 }
1994 other => panic!("expected TooManyRefs, got: {other:?}"),
1995 }
1996 }
1997
1998 // Issue #118: a non-PDF OA body must surface as `Err(HttpError)`
1999 // from `try_fetch_oa_pdf` (previously silently flattened to
2000 // `None`, which `fetch_paper_doi` then reported as a clean
2001 // metadata-only success). The compiler-checked `Err(e) =>
2002 // PdfLegStatus::Blocked` arm in `fetch_paper_doi` does the rest.
2003 #[tokio::test]
2004 async fn try_fetch_oa_pdf_non_pdf_body_is_err_not_silent_none() {
2005 use crate::http::HttpClient;
2006 use crate::provenance::ProvenanceLog;
2007 use crate::rate_limiter::RateLimiter;
2008 use crate::{Doi, RateLimits};
2009 use std::sync::Arc;
2010 use wiremock::matchers::method;
2011 use wiremock::{Mock, MockServer, ResponseTemplate};
2012
2013 let server = MockServer::start().await;
2014 Mock::given(method("GET"))
2015 .respond_with(
2016 ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
2017 )
2018 .mount(&server)
2019 .await;
2020 let host = server
2021 .uri()
2022 .parse::<url::Url>()
2023 .expect("uri")
2024 .host_str()
2025 .expect("host")
2026 .to_string();
2027
2028 let td = tempfile::TempDir::new().expect("tempdir");
2029 let log_path = Utf8Path::from_path(td.path())
2030 .expect("utf-8")
2031 .join("log.jsonl");
2032 let ctx = FetchContext {
2033 http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2034 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2035 log: Arc::new(
2036 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2037 .expect("provenance log"),
2038 ),
2039 session_id: "01J0000000000000000000TEST".into(),
2040 };
2041
2042 let doi = Doi("10.1234/example".to_string());
2043 let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2044 let res = try_fetch_oa_pdf(&doi, &url, &ctx).await;
2045 match res {
2046 Err(HttpError::NotAPdf { .. }) => {}
2047 other => panic!("expected Err(NotAPdf), got: {other:?}"),
2048 }
2049 }
2050
2051 // Issue #145 / `docs/REDIRECT_ALLOWLIST.md` §1: the `oa-publisher`
2052 // host allowlist MUST be consulted on the metadata-discovered OA URL
2053 // *before the actual PDF fetch is issued*, not only on redirect hops.
2054 // An OA URL whose host is OFF the allowlist and that resolves WITHOUT
2055 // a redirect previously slipped past the redirect closure entirely and
2056 // was misclassified as a transport error. This test pins the fix: the
2057 // pre-fetch check rejects it with the SAME `HttpError::RedirectDenied`
2058 // the redirect closure produces, the OA fetch is NEVER issued (the
2059 // wiremock origin records ZERO requests, proving no PDF bytes were
2060 // requested / written), and the provenance trail is the byte-identical
2061 // `Fetch`/`err`/`oa-publisher`/`NETWORK_ERROR` row the redirect-denied
2062 // path emits.
2063 #[tokio::test]
2064 async fn try_fetch_oa_pdf_off_allowlist_host_no_redirect_is_redirect_denied_145() {
2065 use crate::http::HttpClient;
2066 use crate::provenance::ProvenanceLog;
2067 use crate::rate_limiter::RateLimiter;
2068 use crate::{DenialContext, DenialReason, Doi, RateLimits};
2069 use std::sync::Arc;
2070 use wiremock::matchers::method;
2071 use wiremock::{Mock, MockServer, ResponseTemplate};
2072
2073 // The wiremock origin would serve a valid PDF with NO redirect —
2074 // if the pre-check were absent the fetch would *succeed* against
2075 // an off-allowlist host, which is exactly the §1 violation.
2076 let server = MockServer::start().await;
2077 Mock::given(method("GET"))
2078 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7 real pdf".to_vec()))
2079 .mount(&server)
2080 .await;
2081
2082 // Register a DIFFERENT host as the `oa-publisher` allowlist so the
2083 // wiremock origin (127.0.0.1) is OFF it. `evil.example.com` is a
2084 // valid host string the allowlist will not match.
2085 let td = tempfile::TempDir::new().expect("tempdir");
2086 let log_path = Utf8Path::from_path(td.path())
2087 .expect("utf-8")
2088 .join("log.jsonl");
2089 let ctx = FetchContext {
2090 http: Arc::new(HttpClient::new_for_tests_allow_http(
2091 "oa-publisher",
2092 "allowed-publisher.example.com",
2093 )),
2094 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2095 log: Arc::new(
2096 ProvenanceLog::open(log_path.clone(), "01J0000000000000000000TEST".into())
2097 .expect("provenance log"),
2098 ),
2099 session_id: "01J0000000000000000000TEST".into(),
2100 };
2101
2102 let doi = Doi("10.1234/example".to_string());
2103 // The OA URL Unpaywall handed back resolves to the wiremock host,
2104 // which is OFF the `oa-publisher` allowlist.
2105 let off_host_url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2106 let res = try_fetch_oa_pdf(&doi, &off_host_url, &ctx).await;
2107
2108 // 1. Same error variant the redirect closure produces.
2109 let err = match res {
2110 Err(e @ HttpError::RedirectDenied { .. }) => e,
2111 other => {
2112 panic!("expected Err(RedirectDenied) from the pre-fetch check, got: {other:?}")
2113 }
2114 };
2115 match &err {
2116 HttpError::RedirectDenied {
2117 source_key,
2118 host,
2119 expected_hosts,
2120 } => {
2121 assert_eq!(source_key, "oa-publisher");
2122 // The host is lowercased, exactly as the redirect closure
2123 // would record it.
2124 assert_eq!(
2125 host,
2126 off_host_url
2127 .host_str()
2128 .expect("wiremock host")
2129 .to_ascii_lowercase()
2130 .as_str()
2131 );
2132 assert_eq!(
2133 expected_hosts,
2134 &vec!["allowed-publisher.example.com".to_string()]
2135 );
2136 }
2137 _ => unreachable!(),
2138 }
2139
2140 // 2. The OA fetch was NEVER issued — the wiremock origin saw zero
2141 // requests, so no PDF bytes were requested or written.
2142 assert!(
2143 server
2144 .received_requests()
2145 .await
2146 .unwrap_or_default()
2147 .is_empty(),
2148 "the off-allowlist OA URL must NOT be fetched: the pre-check \
2149 (REDIRECT_ALLOWLIST.md §1) rejects it before any request is \
2150 issued; wiremock recorded request(s)",
2151 );
2152
2153 // 3. The structured denial side-channel is byte-identical to the
2154 // redirect-closure path: `RedirectNotInAllowlist`, source key,
2155 // attempted host, expected allowlist snapshot.
2156 let dc: Option<DenialContext> = (&err).into();
2157 let dc = dc.expect("pre-fetch RedirectDenied -> Some(DenialContext)");
2158 assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
2159 assert_eq!(dc.source.as_deref(), Some("oa-publisher"));
2160 assert_eq!(
2161 dc.attempted,
2162 Some(off_host_url.host_str().expect("host").to_ascii_lowercase()),
2163 "attempted host must be the rejected OA URL host, lowercased — \
2164 identical to what the redirect closure records",
2165 );
2166 assert_eq!(
2167 dc.expected,
2168 Some(vec!["allowed-publisher.example.com".to_string()]),
2169 );
2170
2171 // 4. Provenance: exactly the `Fetch`/`err`/`oa-publisher`/
2172 // `NETWORK_ERROR` row the post-fetch redirect-denied arm emits
2173 // (same row kind + source key + closed-set code).
2174 let log_txt = std::fs::read_to_string(&log_path).expect("read provenance log");
2175 let fetch_err_row = log_txt
2176 .lines()
2177 .filter_map(|l| serde_json::from_str::<serde_json::Value>(l).ok())
2178 .find(|v| {
2179 v.get("event").and_then(|e| e.as_str()) == Some("fetch")
2180 && v.get("result").and_then(|r| r.as_str()) == Some("err")
2181 })
2182 .expect("a Fetch/err provenance row was written");
2183 assert_eq!(
2184 fetch_err_row.get("source").and_then(|s| s.as_str()),
2185 Some("oa-publisher"),
2186 );
2187 assert_eq!(
2188 fetch_err_row.get("error_code").and_then(|c| c.as_str()),
2189 Some("NETWORK_ERROR"),
2190 );
2191 assert_eq!(
2192 fetch_err_row.get("ref").and_then(|r| r.as_str()),
2193 Some("10.1234/example"),
2194 );
2195 }
2196
2197 // Issue #145 positive / no-regression: an ON-allowlist OA URL still
2198 // fetches the PDF normally. The pre-fetch check must be a pure gate —
2199 // it must not perturb the happy path.
2200 #[tokio::test]
2201 async fn try_fetch_oa_pdf_on_allowlist_host_still_fetches_pdf_no_regression_145() {
2202 use crate::http::HttpClient;
2203 use crate::provenance::ProvenanceLog;
2204 use crate::rate_limiter::RateLimiter;
2205 use crate::{Doi, RateLimits};
2206 use std::sync::Arc;
2207 use wiremock::matchers::method;
2208 use wiremock::{Mock, MockServer, ResponseTemplate};
2209
2210 let server = MockServer::start().await;
2211 let body = b"%PDF-1.7\nhello pdf".to_vec();
2212 Mock::given(method("GET"))
2213 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
2214 .mount(&server)
2215 .await;
2216 // The wiremock host IS the registered `oa-publisher` allowlist, so
2217 // the pre-check passes and the fetch proceeds as before.
2218 let host = server
2219 .uri()
2220 .parse::<url::Url>()
2221 .expect("uri")
2222 .host_str()
2223 .expect("host")
2224 .to_string();
2225
2226 let td = tempfile::TempDir::new().expect("tempdir");
2227 let log_path = Utf8Path::from_path(td.path())
2228 .expect("utf-8")
2229 .join("log.jsonl");
2230 let ctx = FetchContext {
2231 http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2232 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2233 log: Arc::new(
2234 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2235 .expect("provenance log"),
2236 ),
2237 session_id: "01J0000000000000000000TEST".into(),
2238 };
2239
2240 let doi = Doi("10.1234/example".to_string());
2241 let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2242 let (bytes, _final_url) = try_fetch_oa_pdf(&doi, &url, &ctx)
2243 .await
2244 .expect("on-allowlist OA URL still fetches the PDF");
2245 assert_eq!(bytes, body, "PDF bytes must be returned unchanged");
2246 }
2247
// Issue #145: the pre-fetch denial and the redirect-closure denial MUST
// map to an identical `DenialContext`, so PR #162's CLI classification
// (CAPABILITY_DENIED / exit 3) handles both without change. The check
// is at the value level: two `HttpError::RedirectDenied` values with
// the same source key, host and allowlist snapshot go through the SAME
// `From<&HttpError> for Option<DenialContext>` impl and must come out
// equal.
#[test]
fn pre_fetch_denial_produces_byte_identical_denial_context_as_redirect_denied_145() {
    use crate::{DenialContext, DenialReason};

    // Builds the `RedirectDenied` value shared by both code paths.
    let denied = || HttpError::RedirectDenied {
        source_key: "oa-publisher".to_string(),
        host: "attacker.test".to_string(),
        expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
    };

    // Shape produced by the pre-fetch check in `try_fetch_oa_pdf`.
    let pre_fetch = denied();
    // Shape produced by the redirect closure in `crate::http` for the
    // identical inputs.
    let redirect_closure = denied();

    let from_pre: Option<DenialContext> = (&pre_fetch).into();
    let from_redirect: Option<DenialContext> = (&redirect_closure).into();
    let dc_pre = from_pre.expect("pre-fetch -> Some");
    let dc_red = from_redirect.expect("redirect -> Some");

    // Equal as whole values...
    assert_eq!(dc_pre, dc_red);
    // ...and field by field: reason, source, attempted host, expected
    // snapshot, with every auxiliary channel left as `None`.
    assert_eq!(dc_pre.reason, DenialReason::RedirectNotInAllowlist);
    assert_eq!(dc_pre.source.as_deref(), Some("oa-publisher"));
    assert_eq!(dc_pre.attempted.as_deref(), Some("attacker.test"));
    assert_eq!(
        dc_pre.expected,
        Some(vec!["*.springer.com".to_string(), "*.plos.org".to_string()]),
    );
    assert_eq!(dc_pre.hop_index, None);
    assert_eq!(dc_pre.cap, None);
    assert_eq!(dc_pre.actual, None);
}
2291
2292 // -----------------------------------------------------------------
2293 // #139 — metadata_only_to_store writes the metadata TOML;
2294 // resolve_only / pure metadata_only write NOTHING.
2295 // -----------------------------------------------------------------
2296
2297 /// Build a ctx + FsStore under a fresh tempdir and point Crossref at
2298 /// a wiremock origin that returns one minimal `message`. Returns
2299 /// `(server, ctx, store, store_root, _td)` — `_td` keeps the tempdir
2300 /// alive for the test body.
2301 async fn md139_harness() -> (
2302 wiremock::MockServer,
2303 FetchContext,
2304 crate::store::FsStore,
2305 Utf8PathBuf,
2306 tempfile::TempDir,
2307 ) {
2308 use crate::http::HttpClient;
2309 use crate::provenance::ProvenanceLog;
2310 use crate::rate_limiter::RateLimiter;
2311 use crate::store::FsStore;
2312 use crate::RateLimits;
2313 use std::sync::Arc;
2314 use wiremock::matchers::method;
2315 use wiremock::{Mock, MockServer, ResponseTemplate};
2316
2317 let server = MockServer::start().await;
2318 Mock::given(method("GET"))
2319 .respond_with(ResponseTemplate::new(200).set_body_string(
2320 r#"{"status":"ok","message":{"title":["Example Paper"],"author":[{"given":"Ada","family":"Lovelace"}]}}"#,
2321 ))
2322 .mount(&server)
2323 .await;
2324 std::env::set_var("DOIGET_CROSSREF_BASE", server.uri());
2325
2326 // wiremock serves http://127.0.0.1:PORT; the production client is
2327 // https_only, so the test ctx uses the allow-http test client
2328 // scoped to the crossref/unpaywall source keys + the wiremock host.
2329 let host = server
2330 .uri()
2331 .parse::<url::Url>()
2332 .expect("uri")
2333 .host_str()
2334 .expect("host")
2335 .to_string();
2336
2337 let td = tempfile::TempDir::new().expect("tempdir");
2338 let base = Utf8Path::from_path(td.path()).expect("utf-8");
2339 let log_path = base.join("log.jsonl");
2340 let store_root = base.join("papers");
2341 let ctx = FetchContext {
2342 http: Arc::new(HttpClient::new_for_tests_allow_http_multi(&[
2343 ("crossref", &host),
2344 ("unpaywall", &host),
2345 ])),
2346 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2347 log: Arc::new(
2348 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2349 .expect("provenance log"),
2350 ),
2351 session_id: "01J0000000000000000000TEST".into(),
2352 };
2353 let store = FsStore::new(store_root.clone()).expect("fs store");
2354 (server, ctx, store, store_root, td)
2355 }
2356
2357 fn metadata_dir_tomls(store_root: &Utf8Path) -> Vec<Utf8PathBuf> {
2358 let md = store_root.join(".metadata");
2359 match std::fs::read_dir(md.as_std_path()) {
2360 Ok(rd) => rd
2361 .filter_map(|e| e.ok())
2362 .filter_map(|e| Utf8PathBuf::from_path_buf(e.path()).ok())
2363 .filter(|p| p.extension() == Some("toml"))
2364 .collect(),
2365 Err(_) => Vec::new(),
2366 }
2367 }
2368
2369 #[tokio::test]
2370 #[serial_test::serial]
2371 async fn metadata_only_to_store_writes_metadata_toml_139() {
2372 let (_server, ctx, store, store_root, _td) = md139_harness().await;
2373 let profile = CapabilityProfile::from_env().expect("clean env");
2374 let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2375
2376 let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2377 .await
2378 .expect("metadata_only_to_store ok");
2379 assert_eq!(outcome.source, "crossref");
2380
2381 let tomls = metadata_dir_tomls(&store_root);
2382 assert_eq!(
2383 tomls.len(),
2384 1,
2385 "exactly one .metadata/*.toml must be written (MCP_TOOLS.md §11 SIDE EFFECT, #139); got {tomls:?}"
2386 );
2387 let body = std::fs::read_to_string(&tomls[0]).expect("read metadata toml");
2388 let meta: crate::store::Metadata = toml::from_str(&body).expect("parse metadata toml");
2389 assert_eq!(meta.title, "Example Paper");
2390 assert_eq!(
2391 meta.doi.as_ref().map(|d| d.as_str()),
2392 Some("10.1234/example")
2393 );
2394 let ext = meta.doiget.expect("[doiget] table present");
2395 assert_eq!(ext.source, "crossref");
2396 assert_eq!(ext.size_bytes, 0, "metadata-only entry has no PDF");
2397
2398 std::env::remove_var("DOIGET_CROSSREF_BASE");
2399 }
2400
2401 #[tokio::test]
2402 #[serial_test::serial]
2403 async fn resolve_only_and_pure_metadata_only_write_nothing_139() {
2404 let (_server, ctx, _store, store_root, _td) = md139_harness().await;
2405 let profile = CapabilityProfile::from_env().expect("clean env");
2406 let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2407
2408 // resolve_only: contractually MUST NOT touch the store.
2409 let r = resolve_only(&ref_, &profile, &ctx)
2410 .await
2411 .expect("resolve_only ok");
2412 assert_eq!(r.source, "crossref");
2413 assert!(
2414 metadata_dir_tomls(&store_root).is_empty(),
2415 "resolve_only MUST NOT write a metadata TOML (docs/MCP_TOOLS.md §1; #139)"
2416 );
2417
2418 // The pure metadata_only is also write-free (the store-write
2419 // lives only in metadata_only_to_store).
2420 let m = metadata_only(&ref_, &profile, &ctx)
2421 .await
2422 .expect("metadata_only ok");
2423 assert_eq!(m.source, "crossref");
2424 assert!(
2425 metadata_dir_tomls(&store_root).is_empty(),
2426 "pure metadata_only MUST NOT write to the store (#139)"
2427 );
2428
2429 std::env::remove_var("DOIGET_CROSSREF_BASE");
2430 }
2431
2432 /// #139 — the arXiv branch of `metadata_only_to_store` must also
2433 /// write the metadata TOML (different code path: Atom feed,
2434 /// source="arxiv", license="arxiv-default", doi=None). Review I3/C1.
2435 #[tokio::test]
2436 #[serial_test::serial]
2437 async fn metadata_only_to_store_arxiv_writes_metadata_toml_139() {
2438 use crate::http::HttpClient;
2439 use crate::provenance::ProvenanceLog;
2440 use crate::rate_limiter::RateLimiter;
2441 use crate::store::FsStore;
2442 use crate::RateLimits;
2443 use std::sync::Arc;
2444 use wiremock::matchers::method;
2445 use wiremock::{Mock, MockServer, ResponseTemplate};
2446
2447 let atom = r#"<?xml version="1.0" encoding="UTF-8"?>
2448<feed xmlns="http://www.w3.org/2005/Atom">
2449 <entry>
2450 <id>http://arxiv.org/abs/2401.12345v1</id>
2451 <published>2024-01-15T00:00:00Z</published>
2452 <title>Example arXiv Paper Title</title>
2453 <summary>Example abstract.</summary>
2454 <author><name>Jane Doe</name></author>
2455 <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
2456 </entry>
2457</feed>"#;
2458 let server = MockServer::start().await;
2459 Mock::given(method("GET"))
2460 .respond_with(ResponseTemplate::new(200).set_body_string(atom))
2461 .mount(&server)
2462 .await;
2463 std::env::set_var("DOIGET_ARXIV_BASE", server.uri());
2464 let host = server
2465 .uri()
2466 .parse::<url::Url>()
2467 .expect("uri")
2468 .host_str()
2469 .expect("host")
2470 .to_string();
2471
2472 let td = tempfile::TempDir::new().expect("tempdir");
2473 let base = Utf8Path::from_path(td.path()).expect("utf-8");
2474 let store_root = base.join("papers");
2475 let ctx = FetchContext {
2476 http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", &host)),
2477 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2478 log: Arc::new(
2479 ProvenanceLog::open(base.join("log.jsonl"), "01J0000000000000000000TEST".into())
2480 .expect("provenance log"),
2481 ),
2482 session_id: "01J0000000000000000000TEST".into(),
2483 };
2484 let store = FsStore::new(store_root.clone()).expect("fs store");
2485 let profile = CapabilityProfile::from_env().expect("clean env");
2486 let ref_ = Ref::Arxiv(crate::ArxivId::parse("2401.12345").expect("arxiv id"));
2487
2488 let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2489 .await
2490 .expect("metadata_only_to_store (arxiv) ok");
2491 assert_eq!(outcome.source, "arxiv");
2492
2493 let tomls = metadata_dir_tomls(&store_root);
2494 assert_eq!(
2495 tomls.len(),
2496 1,
2497 "arXiv metadata-only must write one TOML; got {tomls:?}"
2498 );
2499 let meta: crate::store::Metadata =
2500 toml::from_str(&std::fs::read_to_string(&tomls[0]).expect("read")).expect("parse");
2501 assert_eq!(meta.title, "Example arXiv Paper Title");
2502 assert_eq!(
2503 meta.arxiv_id.as_ref().map(|a| a.as_str()),
2504 Some("2401.12345")
2505 );
2506 assert!(meta.doi.is_none(), "arXiv entry has no DOI");
2507 let ext = meta.doiget.expect("[doiget] table");
2508 assert_eq!(ext.source, "arxiv");
2509 assert_eq!(ext.license, "arxiv-default");
2510
2511 std::env::remove_var("DOIGET_ARXIV_BASE");
2512 }
2513
2514 // ----- pure-function unit tests for the #139 extraction helpers ----
2515
2516 #[test]
2517 fn extract_metadata_title_handles_string_array_missing_blank() {
2518 use serde_json::json;
2519 // bare string (arXiv/Unpaywall shape)
2520 assert_eq!(
2521 extract_metadata_title(&json!({"title": "Hello"})),
2522 Some("Hello".to_string())
2523 );
2524 // single-element array (Crossref `message.title` in practice)
2525 assert_eq!(
2526 extract_metadata_title(&json!({"title": ["Real Title"]})),
2527 Some("Real Title".to_string())
2528 );
2529 // missing key -> None (caller falls back to ref id)
2530 assert_eq!(extract_metadata_title(&json!({"x": 1})), None);
2531 // blank string -> None (must not persist an empty title)
2532 assert_eq!(extract_metadata_title(&json!({"title": " "})), None);
2533 // empty array -> None
2534 assert_eq!(extract_metadata_title(&json!({"title": []})), None);
2535 // A leading blank/whitespace array element is SKIPPED — the first
2536 // non-blank element is taken (a stray leading empty element must
2537 // not mask the real Crossref title).
2538 assert_eq!(
2539 extract_metadata_title(&json!({"title": [" ", "Real Title"]})),
2540 Some("Real Title".to_string())
2541 );
2542 // all-blank array -> None (caller falls back to ref id)
2543 assert_eq!(extract_metadata_title(&json!({"title": [" ", ""]})), None);
2544 }
2545
2546 #[test]
2547 fn extract_metadata_authors_handles_each_resolver_shape() {
2548 use serde_json::json;
2549 // arXiv: authors: [String]
2550 assert_eq!(
2551 extract_metadata_authors(&json!({"authors": ["Jane Doe", "John Roe"]})),
2552 vec!["Jane Doe".to_string(), "John Roe".to_string()]
2553 );
2554 // Crossref: author: [{given,family}]
2555 assert_eq!(
2556 extract_metadata_authors(&json!({"author": [{"given": "Ada", "family": "Lovelace"}]})),
2557 vec!["Ada Lovelace".to_string()]
2558 );
2559 // family-only (given absent) -> trimmed, no leading space
2560 assert_eq!(
2561 extract_metadata_authors(&json!({"author": [{"family": "Onsager"}]})),
2562 vec!["Onsager".to_string()]
2563 );
2564 // `name` fallback when given+family both absent
2565 assert_eq!(
2566 extract_metadata_authors(&json!({"author": [{"name": "K. Wilson"}]})),
2567 vec!["K. Wilson".to_string()]
2568 );
2569 // z_authors fallback shape (forward-compat branch)
2570 assert_eq!(
2571 extract_metadata_authors(&json!({"z_authors": [{"given": "L", "family": "Kadanoff"}]})),
2572 vec!["L Kadanoff".to_string()]
2573 );
2574 // nothing parseable -> empty (still a valid TOML)
2575 assert!(extract_metadata_authors(&json!({"x": 1})).is_empty());
2576 assert!(extract_metadata_authors(&json!({"authors": []})).is_empty());
2577 }
2578}