Skip to main content

doiget_core/sources/
crossref.rs

//! Crossref source — DOI metadata + OA URL discovery via the `link[]` array.
//!
//! Spec: docs/SOURCES.md §4 (Crossref). No auth; a polite-pool User-Agent
//! contact email is REQUIRED — see [`CrossrefSource::new`].
5
6use std::collections::HashSet;
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Base URL of the production Crossref REST API.
///
/// Fixed by `docs/SOURCES.md` §4. Tests swap in a wiremock origin through
/// [`CrossrefSource::with_base`] instead of touching this constant.
const DEFAULT_BASE: &str = "https://api.crossref.org";

/// Lowest token-overlap score (fraction of distinct query tokens found in a
/// candidate) at which [`CrossrefSource::resolve_citation`] still reports the
/// candidate. The comparison is inclusive.
const MIN_CITATION_SCORE: f64 = 0.5;
23
24/// Crossref [`Source`] impl — DOI → metadata; OA URL via `message.link[]`.
25///
26/// See `docs/SOURCES.md` §4 for the access policy (no auth, polite pool).
27#[derive(Clone, Debug)]
28pub struct CrossrefSource {
29    /// API base URL. Production constructor pins this to
30    /// `https://api.crossref.org`; the [`with_base`](Self::with_base)
31    /// test-only constructor lets wiremock substitute an `http://127.0.0.1:N`
32    /// origin.
33    base: Url,
34    /// Polite-pool contact email per `docs/SOURCES.md` §4 Crossref.
35    /// Concretely formatted into the `User-Agent` header by [`crate::http::HttpClient`].
36    /// (Phase 1: caller injects via [`CrossrefSource::new`]; CLI / config wiring
37    /// lands in a follow-up PR.)
38    #[allow(dead_code)]
39    contact_email: String,
40}
41
42impl CrossrefSource {
43    /// Production constructor: hard-codes `https://api.crossref.org` as the
44    /// base URL. The `contact_email` value is appended to the polite-pool
45    /// User-Agent (config plumbing arrives in a later PR — see
46    /// `docs/SOURCES.md` §4).
47    #[must_use]
48    pub fn new(contact_email: String) -> Self {
49        Self {
50            // The hard-coded constant is a known-valid URL; the `expect`
51            // here is the documented exception to the workspace
52            // `expect_used` lint (it can never fire in practice).
53            #[allow(clippy::expect_used)]
54            base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
55            contact_email,
56        }
57    }
58
59    /// Construct with an arbitrary base URL.
60    ///
61    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
62    /// the `DOIGET_CROSSREF_BASE` env var, which lets integration tests point
63    /// the source at a wiremock origin without compile-time gates. Production
64    /// callers use [`CrossrefSource::new`].
65    pub fn with_base(base: Url, contact_email: String) -> Self {
66        Self {
67            base,
68            contact_email,
69        }
70    }
71
72    /// Build the `/works/{doi}` URL for the configured base. Returns
73    /// [`FetchError::SourceSchema`] if joining the path produces an invalid
74    /// URL (only possible if the base URL is malformed — should never happen
75    /// in production).
76    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
77        // Crossref accepts the bare DOI (no `doi:` scheme). `Doi::as_str()`
78        // already returns it without the scheme. The `/` inside the suffix
79        // is URL-encoded by `reqwest` when the request is built; wiremock
80        // sees the decoded path on its `path()` matcher.
81        let path = format!("/works/{}", doi.as_str());
82        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
83            hint: format!("crossref URL construction failed: {e}"),
84        })
85    }
86
87    /// Resolves a free-form bibliographic citation string to ranked DOI candidates.
88    pub async fn resolve_citation(
89        &self,
90        query: &str,
91        rows: u8,
92        ctx: &FetchContext,
93    ) -> Result<Vec<crate::ResolvedCandidate>, FetchError> {
94        // 1. Rate limiter
95        let _permit = ctx.rate_limiter.acquire(self.name()).await;
96
97        // 2. Build works query URL
98        // /works?query.bibliographic=<query>&rows=<rows>&mailto=<email>
99        let mut url = self
100            .base
101            .join("/works")
102            .map_err(|e| FetchError::SourceSchema {
103                hint: format!("crossref resolve_citation URL construction failed: {e}"),
104            })?;
105        url.query_pairs_mut()
106            .append_pair("query.bibliographic", query)
107            .append_pair("rows", &rows.to_string())
108            .append_pair("mailto", &self.contact_email);
109
110        // 3. HTTP fetch
111        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
112
113        // 4. Parse JSON
114        let envelope: serde_json::Value =
115            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
116                hint: format!("crossref returned non-JSON for search: {e}"),
117            })?;
118
119        let items = envelope
120            .get("message")
121            .and_then(|m| m.get("items"))
122            .and_then(|i| i.as_array())
123            .ok_or_else(|| FetchError::SourceSchema {
124                hint: "crossref response missing message.items".to_string(),
125            })?;
126
127        // 5. Tokenize query (unique tokens, sorted)
128        let query_tokens = {
129            let mut t: Vec<String> = query
130                .split(|c: char| !c.is_alphanumeric())
131                .map(|s| s.to_lowercase())
132                .filter(|s| !s.is_empty())
133                .collect();
134            t.sort();
135            t.dedup();
136            t
137        };
138
139        if query_tokens.is_empty() {
140            return Ok(Vec::new());
141        }
142
143        let mut candidates = Vec::new();
144
145        for item in items {
146            let doi = match item.get("DOI").and_then(|v| v.as_str()) {
147                Some(d) => d.to_string(),
148                None => continue,
149            };
150
151            let fields = crate::orchestrator::extract_crossref_fields(item);
152
153            // Construct search text from candidate
154            let mut candidate_text = String::new();
155            if let Some(t) = &fields.title {
156                candidate_text.push_str(&t.to_lowercase());
157                candidate_text.push(' ');
158            }
159            if let Some(author) = fields.authors.first() {
160                candidate_text.push_str(&author.to_lowercase());
161                candidate_text.push(' ');
162            }
163            if let Some(v) = &fields.venue {
164                candidate_text.push_str(&v.to_lowercase());
165                candidate_text.push(' ');
166            }
167            if let Some(y) = fields.year {
168                candidate_text.push_str(&y.to_string());
169                candidate_text.push(' ');
170            }
171
172            // Simple tokenize of candidate into a HashSet for O(1) lookup.
173            let candidate_tokens: HashSet<String> = candidate_text
174                .split(|c: char| !c.is_alphanumeric())
175                .map(|s| s.to_lowercase())
176                .filter(|s| !s.is_empty())
177                .collect();
178
179            let matched = query_tokens
180                .iter()
181                .filter(|q| candidate_tokens.contains(*q))
182                .count();
183
184            let score = matched as f64 / query_tokens.len() as f64;
185
186            if score >= MIN_CITATION_SCORE {
187                let first_author = fields.authors.first().cloned().unwrap_or_default();
188                candidates.push(crate::ResolvedCandidate {
189                    doi,
190                    title: fields.title.unwrap_or_default(),
191                    author: first_author,
192                    year: fields.year,
193                    score,
194                    source: "crossref".to_string(),
195                });
196            }
197        }
198
199        // 6. Sort candidates by score descending
200        candidates.sort_by(|a, b| {
201            b.score
202                .partial_cmp(&a.score)
203                .unwrap_or(std::cmp::Ordering::Equal)
204        });
205
206        Ok(candidates)
207    }
208}
209
210#[async_trait]
211impl Source for CrossrefSource {
212    fn name(&self) -> &str {
213        "crossref"
214    }
215
216    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
217        matches!(ref_, Ref::Doi(_))
218    }
219
220    async fn fetch(
221        &self,
222        ref_: &Ref,
223        _profile: &CapabilityProfile,
224        ctx: &FetchContext,
225    ) -> Result<FetchResult, FetchError> {
226        let doi = match ref_ {
227            Ref::Doi(d) => d,
228            Ref::Arxiv(_) => {
229                return Err(FetchError::NotEligible {
230                    source_key: "crossref".into(),
231                });
232            }
233        };
234
235        // Step 1: rate limiter (politeness — `docs/SOURCES.md` §6).
236        let _permit = ctx.rate_limiter.acquire(self.name()).await;
237
238        // Step 2: HTTP fetch. Body is JSON; the `PDF_MAX_BYTES` size cap in
239        // `HttpClient` applies. Crossref responses are well under 100 MB
240        // even for bibliographically rich DOIs.
241        let url = self.request_url(doi)?;
242        let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
243
244        // Step 3: parse the response envelope. Crossref wraps the work
245        // record in a top-level `{ "status": "ok", "message": { ... } }`
246        // envelope (per <https://api.crossref.org/swagger-ui/index.html>).
247        let envelope: CrossrefEnvelope =
248            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
249                hint: format!("crossref returned non-JSON: {e}"),
250            })?;
251        if envelope.status != "ok" {
252            return Err(FetchError::SourceSchema {
253                hint: format!("crossref status = {}", envelope.status),
254            });
255        }
256
257        // Step 4: log the fetch event (`docs/PROVENANCE_LOG.md` §3).
258        // ADR-0021 §1 canonical-digest: promote the ref under the
259        // "crossref" resolver profile (no version — Crossref does not
260        // expose a per-call version token in Phase 1).
261        let canonical = ref_.promote(self.name(), None).digest_hex();
262        ctx.log.append(RowInput {
263            event: LogEvent::Fetch,
264            result: LogResult::Ok,
265            capability: Capability::Oa,
266            ref_: Some(doi.as_str()),
267            source: Some(self.name()),
268            error_code: None,
269            size_bytes: Some(body.len() as u64),
270            license: None,
271            store_path: None,
272            canonical_digest: Some(&canonical),
273        })?;
274
275        Ok(FetchResult {
276            source: self.name().to_string(),
277            license: "unknown".into(),
278            // Crossref is metadata; PDF retrieval is the job of Unpaywall /
279            // publisher sources (Phase 1+ sibling PRs).
280            pdf_bytes: None,
281            final_url: Some(final_url),
282            metadata_json: Some(envelope.message),
283        })
284    }
285}
286
287/// Top-level Crossref response envelope. Only `status` and `message` are
288/// load-bearing here; `message-type`, `message-version`, etc. are ignored.
289#[derive(Debug, Deserialize)]
290struct CrossrefEnvelope {
291    status: String,
292    message: serde_json::Value,
293}
294
295// ---------------------------------------------------------------------------
296// Tests
297// ---------------------------------------------------------------------------
298
#[cfg(test)]
// Tests are allowed to unwrap/expect/panic: a failure there IS the assertion.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    /// DOI used by every single-work test.
    const EXAMPLE_DOI: &str = "10.1234/example";
    /// Request path the source is expected to hit for [`EXAMPLE_DOI`].
    const EXAMPLE_WORK_PATH: &str = "/works/10.1234/example";
    /// Contact email baked into sources built by [`crossref_for`].
    const TEST_EMAIL: &str = "test@example.org";

    /// Build a `FetchContext` whose [`HttpClient`] allows the wiremock
    /// `http://` origin under the `crossref` source key, plus a
    /// tempdir-backed `ProvenanceLog`. Returns the tempdir so the caller
    /// keeps it alive for the duration of the test.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        // Workspace lints ban `std::path::PathBuf`; convert via camino.
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join("test.jsonl");

        // Use the test-only constructor that relaxes `https_only` for the
        // initial leg so wiremock (which serves over plain HTTP) can be
        // reached. Redirect closure still rejects http:// targets — see
        // `http.rs::build_client_allow_http`.
        let http = Arc::new(HttpClient::new_for_tests_allow_http(
            "crossref",
            wiremock_host,
        ));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
            },
        )
    }

    /// Extract the host string of a wiremock server's URI.
    fn server_host(server: &MockServer) -> String {
        server
            .uri()
            .parse::<Url>()
            .expect("wiremock uri parses")
            .host_str()
            .expect("wiremock uri has host")
            .to_string()
    }

    /// Build a [`CrossrefSource`] pointing at the given wiremock URI.
    fn crossref_for(server: &MockServer) -> CrossrefSource {
        let base = server.uri().parse::<Url>().expect("wiremock uri parses");
        CrossrefSource::with_base(base, TEST_EMAIL.to_string())
    }

    /// The [`Ref`] for [`EXAMPLE_DOI`].
    fn example_ref() -> Ref {
        Ref::Doi(Doi::parse(EXAMPLE_DOI).unwrap())
    }

    /// Serve `response` for `GET /works/10.1234/example`.
    async fn mount_example_work(server: &MockServer, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(EXAMPLE_WORK_PATH))
            .respond_with(response)
            .mount(server)
            .await;
    }

    /// Serve `body` (HTTP 200) for any `GET /works` search request.
    async fn mount_search(server: &MockServer, body: serde_json::Value) {
        Mock::given(method("GET"))
            .and(path("/works"))
            .respond_with(ResponseTemplate::new(200).set_body_json(body))
            .mount(server)
            .await;
    }

    #[test]
    fn crossref_can_serve_returns_true_for_doi() {
        let s = CrossrefSource::new(TEST_EMAIL.into());
        let profile = CapabilityProfile::from_env().expect("clean env");
        assert!(s.can_serve(&profile, &example_ref()));
    }

    #[test]
    fn crossref_can_serve_returns_false_for_arxiv() {
        let s = CrossrefSource::new(TEST_EMAIL.into());
        let profile = CapabilityProfile::from_env().expect("clean env");
        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        assert!(!s.can_serve(&profile, &r));
    }

    #[tokio::test]
    async fn crossref_fetch_returns_envelope_message() {
        let server = MockServer::start().await;
        mount_example_work(
            &server,
            ResponseTemplate::new(200)
                .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = CapabilityProfile::from_env().expect("clean env");

        let res = s
            .fetch(&example_ref(), &profile, &ctx)
            .await
            .expect("fetch ok");
        assert_eq!(res.source, "crossref");
        assert_eq!(
            res.metadata_json,
            Some(serde_json::json!({ "title": ["Example"] })),
        );
        assert!(res.pdf_bytes.is_none());
        assert!(res.final_url.is_some());
    }

    #[tokio::test]
    async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
        // wiremock not needed: the arxiv branch short-circuits before any
        // outbound call. Construct the source with a dummy base, and pass
        // a dummy allowlist host since fetch never reaches the HTTP layer.
        let s = CrossrefSource::with_base(
            Url::parse("http://127.0.0.1:1/").unwrap(),
            TEST_EMAIL.into(),
        );
        let (_td, ctx) = build_test_context("127.0.0.1");
        let profile = CapabilityProfile::from_env().expect("clean env");
        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());

        let err = s.fetch(&r, &profile, &ctx).await.expect_err("not eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "crossref");
            }
            other => panic!("expected NotEligible, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_fetch_writes_log_row() {
        let server = MockServer::start().await;
        mount_example_work(
            &server,
            ResponseTemplate::new(200)
                .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        // Named `td` (not `_td`): the directory is read back below.
        let (td, ctx) = build_test_context(&host);
        let profile = CapabilityProfile::from_env().expect("clean env");

        let _res = s
            .fetch(&example_ref(), &profile, &ctx)
            .await
            .expect("fetch ok");

        // Reopen the log file as raw JSON Lines and assert the single row's
        // semantic fields. We deliberately don't reach into ProvenanceLog
        // internals — the public read path is "parse the JSONL by line".
        let log_path = td.path().join("test.jsonl");
        let raw = std::fs::read_to_string(&log_path).expect("log file readable");
        let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(lines.len(), 1, "expected exactly one row, got {lines:?}");
        let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
        assert_eq!(row["event"], "fetch");
        assert_eq!(row["result"], "ok");
        assert_eq!(row["source"], "crossref");
        assert_eq!(row["ref"], EXAMPLE_DOI);
    }

    #[tokio::test]
    async fn crossref_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_example_work(&server, ResponseTemplate::new(404)).await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = CapabilityProfile::from_env().expect("clean env");

        let err = s
            .fetch(&example_ref(), &profile, &ctx)
            .await
            .expect_err("404 errors");
        match err {
            FetchError::Http(_) => {}
            other => panic!("expected Http(_) on 404, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_non_ok_status_field_errors_source_schema() {
        let server = MockServer::start().await;
        mount_example_work(
            &server,
            ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = CapabilityProfile::from_env().expect("clean env");

        let err = s
            .fetch(&example_ref(), &profile, &ctx)
            .await
            .expect_err("non-ok status errors");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("status"),
                    "expected status mention in hint, got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_resolve_citation_success() {
        let server = MockServer::start().await;
        let mock_body = serde_json::json!({
            "status": "ok",
            "message": {
                "items": [
                    {
                        "DOI": "10.1000/xyz123",
                        "title": ["Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition"],
                        "author": [
                            {"family": "Onsager", "given": "Lars"}
                        ],
                        "issued": {
                            "date-parts": [[1944, 2, 1]]
                        },
                        "container-title": ["Physical Review"]
                    },
                    {
                        "DOI": "10.1000/unrelated",
                        "title": ["Some Unrelated Paper"],
                        "author": [
                            {"family": "Smith", "given": "John"}
                        ],
                        "issued": {
                            "date-parts": [[2020]]
                        }
                    }
                ]
            }
        });

        // Pin the outgoing query string as well: if any of the three
        // parameters is dropped or renamed the mock does not match and the
        // call fails instead of silently passing.
        Mock::given(method("GET"))
            .and(path("/works"))
            .and(query_param("query.bibliographic", "Onsager 1944"))
            .and(query_param("rows", "2"))
            .and(query_param("mailto", TEST_EMAIL))
            .respond_with(ResponseTemplate::new(200).set_body_json(mock_body))
            .mount(&server)
            .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);

        let candidates = s
            .resolve_citation("Onsager 1944", 2, &ctx)
            .await
            .expect("resolve ok");

        // The query "Onsager 1944" has tokens ["onsager", "1944"].
        // The first candidate has both "onsager" (author family) and "1944" (issued year). Score is 1.0.
        // The second candidate has neither. Score is 0.0, filtered out.
        assert_eq!(candidates.len(), 1);
        let cand = &candidates[0];
        assert_eq!(cand.doi, "10.1000/xyz123");
        assert_eq!(cand.title, "Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition");
        assert_eq!(cand.author, "Onsager, Lars");
        assert_eq!(cand.year, Some(1944));
        assert!(
            (cand.score - 1.0).abs() < f64::EPSILON,
            "expected a perfect score, got {}",
            cand.score
        );
    }

    #[tokio::test]
    async fn crossref_resolve_citation_ranks_best_match_first() {
        let server = MockServer::start().await;
        // Deliberately listed worst-first so the assertion exercises the sort.
        mount_search(
            &server,
            serde_json::json!({
                "status": "ok",
                "message": {
                    "items": [
                        {
                            "DOI": "10.1000/partial",
                            "title": ["Reciprocal Relations in Irreversible Processes"],
                            "author": [{"family": "Onsager", "given": "Lars"}],
                            "issued": {"date-parts": [[1931]]}
                        },
                        {
                            "DOI": "10.1000/full",
                            "title": ["Crystal Statistics"],
                            "author": [{"family": "Onsager", "given": "Lars"}],
                            "issued": {"date-parts": [[1944]]}
                        }
                    ]
                }
            }),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);

        let candidates = s
            .resolve_citation("Onsager 1944", 2, &ctx)
            .await
            .expect("resolve ok");

        // "partial" matches only "onsager" (0.5, exactly at the threshold);
        // "full" matches both tokens (1.0) and must come first.
        let dois: Vec<&str> = candidates.iter().map(|c| c.doi.as_str()).collect();
        assert_eq!(dois, ["10.1000/full", "10.1000/partial"]);
    }

    #[tokio::test]
    async fn crossref_resolve_citation_missing_items_errors_source_schema() {
        let server = MockServer::start().await;
        mount_search(&server, serde_json::json!({ "status": "ok", "message": {} })).await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);

        let err = s
            .resolve_citation("Onsager 1944", 2, &ctx)
            .await
            .expect_err("missing items errors");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("message.items"),
                    "expected message.items mention in hint, got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }
}