1use std::collections::HashSet;
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Base URL of the public Crossref REST API; parsed by `CrossrefSource::new`.
const DEFAULT_BASE: &str = "https://api.crossref.org";

/// Minimum fraction of distinct query tokens that must be found in a
/// candidate's metadata for `resolve_citation` to keep that candidate.
const MIN_CITATION_SCORE: f64 = 0.5;
23
24#[derive(Clone, Debug)]
28pub struct CrossrefSource {
29 base: Url,
34 #[allow(dead_code)]
39 contact_email: String,
40}
41
42impl CrossrefSource {
43 #[must_use]
48 pub fn new(contact_email: String) -> Self {
49 Self {
50 #[allow(clippy::expect_used)]
54 base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
55 contact_email,
56 }
57 }
58
59 pub fn with_base(base: Url, contact_email: String) -> Self {
66 Self {
67 base,
68 contact_email,
69 }
70 }
71
72 fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
77 let path = format!("/works/{}", doi.as_str());
82 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
83 hint: format!("crossref URL construction failed: {e}"),
84 })
85 }
86
87 pub async fn resolve_citation(
89 &self,
90 query: &str,
91 rows: u8,
92 ctx: &FetchContext,
93 ) -> Result<Vec<crate::ResolvedCandidate>, FetchError> {
94 let _permit = ctx.rate_limiter.acquire(self.name()).await;
96
97 let mut url = self
100 .base
101 .join("/works")
102 .map_err(|e| FetchError::SourceSchema {
103 hint: format!("crossref resolve_citation URL construction failed: {e}"),
104 })?;
105 url.query_pairs_mut()
106 .append_pair("query.bibliographic", query)
107 .append_pair("rows", &rows.to_string())
108 .append_pair("mailto", &self.contact_email);
109
110 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
112
113 let envelope: serde_json::Value =
115 serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
116 hint: format!("crossref returned non-JSON for search: {e}"),
117 })?;
118
119 let items = envelope
120 .get("message")
121 .and_then(|m| m.get("items"))
122 .and_then(|i| i.as_array())
123 .ok_or_else(|| FetchError::SourceSchema {
124 hint: "crossref response missing message.items".to_string(),
125 })?;
126
127 let query_tokens = {
129 let mut t: Vec<String> = query
130 .split(|c: char| !c.is_alphanumeric())
131 .map(|s| s.to_lowercase())
132 .filter(|s| !s.is_empty())
133 .collect();
134 t.sort();
135 t.dedup();
136 t
137 };
138
139 if query_tokens.is_empty() {
140 return Ok(Vec::new());
141 }
142
143 let mut candidates = Vec::new();
144
145 for item in items {
146 let doi = match item.get("DOI").and_then(|v| v.as_str()) {
147 Some(d) => d.to_string(),
148 None => continue,
149 };
150
151 let fields = crate::orchestrator::extract_crossref_fields(item);
152
153 let mut candidate_text = String::new();
155 if let Some(t) = &fields.title {
156 candidate_text.push_str(&t.to_lowercase());
157 candidate_text.push(' ');
158 }
159 if let Some(author) = fields.authors.first() {
160 candidate_text.push_str(&author.to_lowercase());
161 candidate_text.push(' ');
162 }
163 if let Some(v) = &fields.venue {
164 candidate_text.push_str(&v.to_lowercase());
165 candidate_text.push(' ');
166 }
167 if let Some(y) = fields.year {
168 candidate_text.push_str(&y.to_string());
169 candidate_text.push(' ');
170 }
171
172 let candidate_tokens: HashSet<String> = candidate_text
174 .split(|c: char| !c.is_alphanumeric())
175 .map(|s| s.to_lowercase())
176 .filter(|s| !s.is_empty())
177 .collect();
178
179 let matched = query_tokens
180 .iter()
181 .filter(|q| candidate_tokens.contains(*q))
182 .count();
183
184 let score = matched as f64 / query_tokens.len() as f64;
185
186 if score >= MIN_CITATION_SCORE {
187 let first_author = fields.authors.first().cloned().unwrap_or_default();
188 candidates.push(crate::ResolvedCandidate {
189 doi,
190 title: fields.title.unwrap_or_default(),
191 author: first_author,
192 year: fields.year,
193 score,
194 source: "crossref".to_string(),
195 });
196 }
197 }
198
199 candidates.sort_by(|a, b| {
201 b.score
202 .partial_cmp(&a.score)
203 .unwrap_or(std::cmp::Ordering::Equal)
204 });
205
206 Ok(candidates)
207 }
208}
209
210#[async_trait]
211impl Source for CrossrefSource {
212 fn name(&self) -> &str {
213 "crossref"
214 }
215
216 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
217 matches!(ref_, Ref::Doi(_))
218 }
219
220 async fn fetch(
221 &self,
222 ref_: &Ref,
223 _profile: &CapabilityProfile,
224 ctx: &FetchContext,
225 ) -> Result<FetchResult, FetchError> {
226 let doi = match ref_ {
227 Ref::Doi(d) => d,
228 Ref::Arxiv(_) => {
229 return Err(FetchError::NotEligible {
230 source_key: "crossref".into(),
231 });
232 }
233 };
234
235 let _permit = ctx.rate_limiter.acquire(self.name()).await;
237
238 let url = self.request_url(doi)?;
242 let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
243
244 let envelope: CrossrefEnvelope =
248 serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
249 hint: format!("crossref returned non-JSON: {e}"),
250 })?;
251 if envelope.status != "ok" {
252 return Err(FetchError::SourceSchema {
253 hint: format!("crossref status = {}", envelope.status),
254 });
255 }
256
257 let canonical = ref_.promote(self.name(), None).digest_hex();
262 ctx.log.append(RowInput {
263 event: LogEvent::Fetch,
264 result: LogResult::Ok,
265 capability: Capability::Oa,
266 ref_: Some(doi.as_str()),
267 source: Some(self.name()),
268 error_code: None,
269 size_bytes: Some(body.len() as u64),
270 license: None,
271 store_path: None,
272 canonical_digest: Some(&canonical),
273 })?;
274
275 Ok(FetchResult {
276 source: self.name().to_string(),
277 license: "unknown".into(),
278 pdf_bytes: None,
281 final_url: Some(final_url),
282 metadata_json: Some(envelope.message),
283 })
284 }
285}
286
/// The two top-level fields of a Crossref `/works/{doi}` response that
/// `fetch` reads.
#[derive(Debug, Deserialize)]
struct CrossrefEnvelope {
    /// Compared against `"ok"` by `fetch`; any other value is reported as a
    /// `SourceSchema` error.
    status: String,
    /// Opaque payload returned to the caller as `FetchResult::metadata_json`.
    message: serde_json::Value,
}
294
295#[cfg(test)]
300#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
301mod tests {
302 use super::*;
303
304 use std::sync::Arc;
305
306 use camino::Utf8PathBuf;
307 use tempfile::TempDir;
308 use wiremock::matchers::{method, path};
309 use wiremock::{Mock, MockServer, ResponseTemplate};
310
311 use crate::http::HttpClient;
312 use crate::provenance::ProvenanceLog;
313 use crate::rate_limiter::RateLimiter;
314 use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
315
316 fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
321 let td = TempDir::new().expect("tempdir");
322 let log_dir =
324 Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
325 let log_path = log_dir.join("test.jsonl");
326
327 let http = Arc::new(HttpClient::new_for_tests_allow_http(
332 "crossref",
333 wiremock_host,
334 ));
335 let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
336 let session_id = "01J0000000000000000000TEST".to_string();
337 let log = Arc::new(
338 ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
339 );
340
341 (
342 td,
343 FetchContext {
344 http,
345 rate_limiter,
346 log,
347 session_id,
348 },
349 )
350 }
351
352 fn server_host(server: &MockServer) -> String {
354 server
355 .uri()
356 .parse::<Url>()
357 .expect("wiremock uri parses")
358 .host_str()
359 .expect("wiremock uri has host")
360 .to_string()
361 }
362
363 fn crossref_for(server: &MockServer) -> CrossrefSource {
365 let base = server.uri().parse::<Url>().expect("wiremock uri parses");
366 CrossrefSource::with_base(base, "test@example.org".to_string())
367 }
368
369 #[test]
370 fn crossref_can_serve_returns_true_for_doi() {
371 let s = CrossrefSource::new("test@example.org".into());
372 let profile = CapabilityProfile::from_env().expect("clean env");
373 let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
374 assert!(s.can_serve(&profile, &r));
375 }
376
377 #[test]
378 fn crossref_can_serve_returns_false_for_arxiv() {
379 let s = CrossrefSource::new("test@example.org".into());
380 let profile = CapabilityProfile::from_env().expect("clean env");
381 let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
382 assert!(!s.can_serve(&profile, &r));
383 }
384
385 #[tokio::test]
386 async fn crossref_fetch_returns_envelope_message() {
387 let server = MockServer::start().await;
388 Mock::given(method("GET"))
389 .and(path("/works/10.1234/example"))
390 .respond_with(
391 ResponseTemplate::new(200)
392 .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
393 )
394 .mount(&server)
395 .await;
396
397 let host = server_host(&server);
398 let s = crossref_for(&server);
399 let (_td, ctx) = build_test_context(&host);
400 let profile = CapabilityProfile::from_env().expect("clean env");
401 let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
402
403 let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
404 assert_eq!(res.source, "crossref");
405 assert_eq!(
406 res.metadata_json,
407 Some(serde_json::json!({ "title": ["Example"] })),
408 );
409 assert!(res.pdf_bytes.is_none());
410 assert!(res.final_url.is_some());
411 }
412
413 #[tokio::test]
414 async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
415 let s = CrossrefSource::with_base(
419 Url::parse("http://127.0.0.1:1/").unwrap(),
420 "test@example.org".into(),
421 );
422 let (_td, ctx) = build_test_context("127.0.0.1");
423 let profile = CapabilityProfile::from_env().expect("clean env");
424 let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
425
426 let err = s.fetch(&r, &profile, &ctx).await.expect_err("not eligible");
427 match err {
428 FetchError::NotEligible { source_key } => {
429 assert_eq!(source_key, "crossref");
430 }
431 other => panic!("expected NotEligible, got {:?}", other),
432 }
433 }
434
435 #[tokio::test]
436 async fn crossref_fetch_writes_log_row() {
437 let server = MockServer::start().await;
438 Mock::given(method("GET"))
439 .and(path("/works/10.1234/example"))
440 .respond_with(
441 ResponseTemplate::new(200)
442 .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
443 )
444 .mount(&server)
445 .await;
446
447 let host = server_host(&server);
448 let s = crossref_for(&server);
449 let (_td, ctx) = build_test_context(&host);
450 let profile = CapabilityProfile::from_env().expect("clean env");
451 let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
452
453 let _res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
454
455 let log_path = _td.path().join("test.jsonl");
459 let raw = std::fs::read_to_string(&log_path).expect("log file readable");
460 let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
461 assert_eq!(lines.len(), 1, "expected exactly one row, got {:?}", lines);
462 let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
463 assert_eq!(row["event"], "fetch");
464 assert_eq!(row["result"], "ok");
465 assert_eq!(row["source"], "crossref");
466 assert_eq!(row["ref"], "10.1234/example");
467 }
468
469 #[tokio::test]
470 async fn crossref_404_maps_to_http_error() {
471 let server = MockServer::start().await;
472 Mock::given(method("GET"))
473 .and(path("/works/10.1234/example"))
474 .respond_with(ResponseTemplate::new(404))
475 .mount(&server)
476 .await;
477
478 let host = server_host(&server);
479 let s = crossref_for(&server);
480 let (_td, ctx) = build_test_context(&host);
481 let profile = CapabilityProfile::from_env().expect("clean env");
482 let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
483
484 let err = s.fetch(&r, &profile, &ctx).await.expect_err("404 errors");
485 match err {
486 FetchError::Http(_) => {}
487 other => panic!("expected Http(_) on 404, got {:?}", other),
488 }
489 }
490
491 #[tokio::test]
492 async fn crossref_non_ok_status_field_errors_source_schema() {
493 let server = MockServer::start().await;
494 Mock::given(method("GET"))
495 .and(path("/works/10.1234/example"))
496 .respond_with(
497 ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
498 )
499 .mount(&server)
500 .await;
501
502 let host = server_host(&server);
503 let s = crossref_for(&server);
504 let (_td, ctx) = build_test_context(&host);
505 let profile = CapabilityProfile::from_env().expect("clean env");
506 let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
507
508 let err = s
509 .fetch(&r, &profile, &ctx)
510 .await
511 .expect_err("non-ok status errors");
512 match err {
513 FetchError::SourceSchema { hint } => {
514 assert!(
515 hint.contains("status"),
516 "expected status mention in hint, got {hint}"
517 );
518 }
519 other => panic!("expected SourceSchema, got {:?}", other),
520 }
521 }
522
523 #[tokio::test]
524 async fn test_resolve_citation_success() {
525 let server = MockServer::start().await;
526 let mock_body = serde_json::json!({
527 "status": "ok",
528 "message": {
529 "items": [
530 {
531 "DOI": "10.1000/xyz123",
532 "title": ["Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition"],
533 "author": [
534 {"family": "Onsager", "given": "Lars"}
535 ],
536 "issued": {
537 "date-parts": [[1944, 2, 1]]
538 },
539 "container-title": ["Physical Review"]
540 },
541 {
542 "DOI": "10.1000/unrelated",
543 "title": ["Some Unrelated Paper"],
544 "author": [
545 {"family": "Smith", "given": "John"}
546 ],
547 "issued": {
548 "date-parts": [[2020]]
549 }
550 }
551 ]
552 }
553 });
554
555 Mock::given(method("GET"))
556 .and(path("/works"))
557 .respond_with(ResponseTemplate::new(200).set_body_json(mock_body))
558 .mount(&server)
559 .await;
560
561 let host = server_host(&server);
562 let s = crossref_for(&server);
563 let (_td, ctx) = build_test_context(&host);
564
565 let candidates = s
566 .resolve_citation("Onsager 1944", 2, &ctx)
567 .await
568 .expect("resolve ok");
569
570 assert_eq!(candidates.len(), 1);
574 let cand = &candidates[0];
575 assert_eq!(cand.doi, "10.1000/xyz123");
576 assert_eq!(cand.title, "Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition");
577 assert_eq!(cand.author, "Onsager, Lars");
578 assert_eq!(cand.year, Some(1944));
579 assert_eq!(cand.score, 1.0);
580 }
581}