From d660ed457e30eeb335085a1754617f4f23130d5c Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Fri, 25 Jul 2025 10:42:00 +0200 Subject: [PATCH] refactor: centralize constants and improve tests - Centralize LICENSE_MAP, API_URLS, DERIVATIVE_ALLOWED_LICENSES, and TEMPLATES in core/constants.py - Replace custom HTTP_STATUS dict with Python's standard http.HTTPStatus enum - Add comprehensive tests for derivative license logic in AbstractProcessor - Add unit tests for DERIVATIVE_ALLOWED_LICENSES constant validation - Create helper function create_license_from_map() for consistent test data - Fix formatting inconsistencies in constants.py (remove double empty lines) - Use descriptive _mock_print variable names to satisfy ruff linting - Update CHANGELOG to reflect constants centralization and HTTPStatus usage This eliminates code duplication, improves maintainability, and ensures derivative license detection logic is properly tested. --- CHANGELOG.md | 5 + doi2dataset/__init__.py | 12 +- doi2dataset/api/processors.py | 38 ++---- doi2dataset/core/constants.py | 42 ++++++ doi2dataset/processing/metadata.py | 13 +- tests/test_abstract_processor.py | 206 +++++++++++++++++++++++++++++ tests/test_license_processor.py | 54 +++++++- 7 files changed, 335 insertions(+), 35 deletions(-) create mode 100644 tests/test_abstract_processor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bdd8a5..da40350 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Centralize ICONS definitions in core/constants.py module to eliminate code duplication +- Centralize additional constants (LICENSE_MAP, API_URLS, etc.) in core/constants.py module +- Remove hardcoded Creative Commons license mappings from processors.py +- Remove hardcoded API URLs for OpenAlex and CrossRef from processing modules +- Replace custom HTTP status code constants with Python's standard `http.HTTPStatus` enum +- Remove hardcoded template strings throughout codebase ### Fixed diff --git a/doi2dataset/__init__.py b/doi2dataset/__init__.py index 1c260d7..5bd6bb4 100644 --- a/doi2dataset/__init__.py +++ b/doi2dataset/__init__.py @@ -51,7 +51,13 @@ from .core import ( Person, PrimitiveMetadataField, ) -from .core.constants import ICONS +from .core.constants import ( + API_URLS, + DERIVATIVE_ALLOWED_LICENSES, + ICONS, + LICENSE_MAP, + TEMPLATES, +) from .processing import ( CitationBuilder, MetadataProcessor, @@ -83,6 +89,10 @@ __all__ = [ "Abstract", # Constants "ICONS", + "LICENSE_MAP", + "API_URLS", + "DERIVATIVE_ALLOWED_LICENSES", + "TEMPLATES", # Metadata fields "BaseMetadataField", "PrimitiveMetadataField", diff --git a/doi2dataset/api/processors.py b/doi2dataset/api/processors.py index a117366..fe31bd8 100644 --- a/doi2dataset/api/processors.py +++ b/doi2dataset/api/processors.py @@ -6,11 +6,17 @@ including license processing and abstract extraction/cleaning. """ import re +from http import HTTPStatus from typing import Any from rich.console import Console -from ..core.constants import ICONS +from ..core.constants import ( + API_URLS, + DERIVATIVE_ALLOWED_LICENSES, + ICONS, + LICENSE_MAP, +) from ..core.models import Abstract, License from .client import APIClient @@ -20,26 +26,6 @@ class LicenseProcessor: Processes license information from metadata. """ - LICENSE_MAP = { - "cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"), - "cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"), - "cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"), - "cc-by-nc-sa": ( - "https://creativecommons.org/licenses/by-nc-sa/4.0/", - "CC BY-NC-SA 4.0", - ), - "cc-by-nc-nd": ( - "https://creativecommons.org/licenses/by-nc-nd/4.0/", - "CC BY-NC-ND 4.0", - ), - "cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"), - "cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"), - "pd": ( - "https://creativecommons.org/publicdomain/mark/1.0/", - "Public Domain Mark 1.0", - ), - } - @classmethod def process_license(cls, data: dict[str, Any]) -> License: """ @@ -58,7 +44,7 @@ class LicenseProcessor: return License(name="", uri="", short="unknown") base_license = license_short.split("/")[0].lower() - uri, name = cls.LICENSE_MAP.get(base_license, ("", license_short)) + uri, name = LICENSE_MAP.get(base_license, ("", license_short)) return License(name=name, uri=uri, short=license_short) @@ -92,9 +78,7 @@ class AbstractProcessor: Returns: Abstract: The abstract with its source. """ - license_ok = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"} - - if license.short in license_ok: + if license.short in DERIVATIVE_ALLOWED_LICENSES: self.console.print( f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.", style="info", @@ -144,10 +128,10 @@ class AbstractProcessor: Returns: str | None: The abstract if found, otherwise None. """ - url = f"https://api.crossref.org/works/{doi}" + url = f"{API_URLS['crossref_base']}{doi}" response = self.api_client.make_request(url) - if response and response.status_code == 200: + if response and response.status_code == HTTPStatus.OK: abstract_raw = response.json().get("message", {}).get("abstract") return self._clean_jats(abstract_raw) return None diff --git a/doi2dataset/core/constants.py b/doi2dataset/core/constants.py index dacbe48..4fb3306 100644 --- a/doi2dataset/core/constants.py +++ b/doi2dataset/core/constants.py @@ -41,3 +41,45 @@ EMOJI_ICONS = { # Default icon set preference DEFAULT_ICONS = ICONS + +# API endpoint URLs +API_URLS = { + "openalex_base": "https://api.openalex.org/works/https://doi.org/", + "crossref_base": "https://api.crossref.org/works/", +} + +# License mapping for Creative Commons and public domain licenses +LICENSE_MAP = { + "cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"), + "cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"), + "cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"), + "cc-by-nc-sa": ( + "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "CC BY-NC-SA 4.0", + ), + "cc-by-nc-nd": ( + "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "CC BY-NC-ND 4.0", + ), + "cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"), + "cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"), + "pd": ( + "https://creativecommons.org/publicdomain/mark/1.0/", + "Public Domain Mark 1.0", + ), +} + +# Licenses that allow derivative works (for abstract extraction) +DERIVATIVE_ALLOWED_LICENSES = { + "cc-by", + "cc-by-sa", + "cc-by-nc", + "cc-by-nc-sa", + "cc0", + "pd", +} + +# Template strings +TEMPLATES = { + "copyright_todo": "All rights reserved. Copyright © {year}, [TODO: Insert copyright holder here!]", +} diff --git a/doi2dataset/processing/metadata.py b/doi2dataset/processing/metadata.py index e0e1a55..53e9588 100644 --- a/doi2dataset/processing/metadata.py +++ b/doi2dataset/processing/metadata.py @@ -7,6 +7,7 @@ of processing DOIs: fetching data, building metadata, and optionally uploading t import json import warnings +from http import HTTPStatus from pathlib import Path from typing import Any @@ -16,7 +17,7 @@ from rich.progress import Progress, TaskID from ..api.client import APIClient from ..api.processors import AbstractProcessor, LicenseProcessor from ..core.config import Config -from ..core.constants import ICONS +from ..core.constants import API_URLS, ICONS, TEMPLATES from ..core.metadata_fields import ( CompoundMetadataField, ControlledVocabularyMetadataField, @@ -190,10 +191,10 @@ class MetadataProcessor: Raises: ValueError: If data fetching fails. """ - url = f"https://api.openalex.org/works/https://doi.org/{self.doi}" + url = f"{API_URLS['openalex_base']}{self.doi}" response = self.api_client.make_request(url) - if response is None or response.status_code != 200: + if response is None or response.status_code != HTTPStatus.OK: self.console.print( f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}", style="error", @@ -319,9 +320,9 @@ class MetadataProcessor: "uri": license_info.uri, } else: - return_dict["datasetVersion"]["termsOfUse"] = ( - f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]" - ) + return_dict["datasetVersion"]["termsOfUse"] = TEMPLATES[ + "copyright_todo" + ].format(year=self._get_publication_year(data)) return return_dict diff --git a/tests/test_abstract_processor.py b/tests/test_abstract_processor.py new file mode 100644 index 0000000..c0ca6c0 --- /dev/null +++ b/tests/test_abstract_processor.py @@ -0,0 +1,206 @@ +from unittest.mock import patch + +import pytest + +from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, LICENSE_MAP, License +from doi2dataset.api.client import APIClient +from doi2dataset.api.processors import AbstractProcessor + + +def create_license_from_map(license_short: str) -> License: + """Helper function to create License objects from LICENSE_MAP""" + if license_short in LICENSE_MAP: + uri, name = LICENSE_MAP[license_short] + return License(name=name, uri=uri, short=license_short) + else: + # For unknown licenses not in the map + return License(name="Unknown License", uri="", short=license_short) + + +class TestAbstractProcessor: + """Test cases for AbstractProcessor derivative license logic""" + + def setup_method(self): + """Setup test fixtures""" + self.api_client = APIClient() + self.processor = AbstractProcessor(self.api_client) + + def test_derivative_allowed_license_uses_crossref(self): + """Test that licenses allowing derivatives attempt CrossRef first""" + # Create a license that allows derivatives using LICENSE_MAP + license_obj = create_license_from_map("cc-by") + + # Mock the CrossRef method to return an abstract and console output + with patch.object( + self.processor, + "_get_crossref_abstract", + return_value="CrossRef abstract text", + ) as mock_crossref: + with patch.object( + self.processor, "_get_openalex_abstract" + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should call CrossRef and get result + mock_crossref.assert_called_once_with("10.1234/test") + mock_openalex.assert_not_called() + assert result.text == "CrossRef abstract text" + assert result.source == "crossref" + + def test_derivative_not_allowed_license_uses_openalex(self): + """Test that licenses not allowing derivatives use OpenAlex reconstruction""" + # Create a license that does not allow derivatives using LICENSE_MAP + license_obj = create_license_from_map("cc-by-nd") + + # Mock the OpenAlex method to return an abstract + with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref: + with patch.object( + self.processor, + "_get_openalex_abstract", + return_value="OpenAlex reconstructed text", + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should skip CrossRef and use OpenAlex + mock_crossref.assert_not_called() + mock_openalex.assert_called_once_with({}) + assert result.text == "OpenAlex reconstructed text" + assert result.source == "openalex" + + def test_unknown_license_uses_openalex(self): + """Test that unknown licenses default to OpenAlex reconstruction""" + # Create an unknown license (not in LICENSE_MAP) + license_obj = create_license_from_map("unknown-license") + + # Mock the OpenAlex method to return an abstract + with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref: + with patch.object( + self.processor, + "_get_openalex_abstract", + return_value="OpenAlex reconstructed text", + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should skip CrossRef and use OpenAlex + mock_crossref.assert_not_called() + mock_openalex.assert_called_once_with({}) + assert result.text == "OpenAlex reconstructed text" + assert result.source == "openalex" + + def test_crossref_fallback_to_openalex(self): + """Test fallback to OpenAlex when CrossRef returns no abstract""" + # Create a license that allows derivatives using LICENSE_MAP + license_obj = create_license_from_map("cc-by") + + # Mock CrossRef to return None (no abstract found) + with patch.object( + self.processor, "_get_crossref_abstract", return_value=None + ) as mock_crossref: + with patch.object( + self.processor, + "_get_openalex_abstract", + return_value="OpenAlex fallback text", + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should try CrossRef first, then fall back to OpenAlex + mock_crossref.assert_called_once_with("10.1234/test") + mock_openalex.assert_called_once_with({}) + assert result.text == "OpenAlex fallback text" + assert result.source == "openalex" + + def test_no_abstract_found_anywhere(self): + """Test when no abstract is found in either source""" + # Create a license that allows derivatives using LICENSE_MAP + license_obj = create_license_from_map("cc-by") + + # Mock both methods to return None + with patch.object( + self.processor, "_get_crossref_abstract", return_value=None + ) as mock_crossref: + with patch.object( + self.processor, "_get_openalex_abstract", return_value=None + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should try both sources + mock_crossref.assert_called_once_with("10.1234/test") + mock_openalex.assert_called_once_with({}) + assert result.text == "" + assert result.source == "none" + + @pytest.mark.parametrize("license_short", DERIVATIVE_ALLOWED_LICENSES) + def test_all_derivative_allowed_licenses_use_crossref_first(self, license_short): + """Test that all licenses in DERIVATIVE_ALLOWED_LICENSES use CrossRef first""" + # Create license using LICENSE_MAP data + license_obj = create_license_from_map(license_short) + + with patch.object( + self.processor, "_get_crossref_abstract", return_value="CrossRef text" + ) as mock_crossref: + with patch.object( + self.processor, "_get_openalex_abstract" + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + result = self.processor.get_abstract( + "10.1234/test", {}, license_obj + ) + + # Should use CrossRef for all derivative-allowed licenses + mock_crossref.assert_called_once() + mock_openalex.assert_not_called() + assert result.source == "crossref" + + def test_derivative_allowed_licenses_set_matches_usage(self): + """Test that DERIVATIVE_ALLOWED_LICENSES set is correctly used in logic""" + # This is a meta-test to ensure the constant is used correctly + + # Test a license that should allow derivatives using LICENSE_MAP + allowed_license = create_license_from_map("cc-by") + assert allowed_license.short in DERIVATIVE_ALLOWED_LICENSES + + # Test a license that should not allow derivatives using LICENSE_MAP + not_allowed_license = create_license_from_map("cc-by-nd") + assert not_allowed_license.short not in DERIVATIVE_ALLOWED_LICENSES + + # Test that the processor logic matches the set + with patch.object( + self.processor, "_get_crossref_abstract", return_value="CrossRef" + ) as mock_crossref: + with patch.object( + self.processor, "_get_openalex_abstract", return_value="OpenAlex" + ) as mock_openalex: + with patch.object(self.processor.console, "print") as _mock_print: + # Allowed license should use CrossRef + result1 = self.processor.get_abstract( + "10.1234/test", {}, allowed_license + ) + assert mock_crossref.call_count == 1 + assert result1.source == "crossref" + + # Reset mocks + mock_crossref.reset_mock() + mock_openalex.reset_mock() + + # Not allowed license should skip CrossRef + result2 = self.processor.get_abstract( + "10.1234/test", {}, not_allowed_license + ) + mock_crossref.assert_not_called() + mock_openalex.assert_called_once() + assert result2.source == "openalex" diff --git a/tests/test_license_processor.py b/tests/test_license_processor.py index f9eff58..ff9a164 100644 --- a/tests/test_license_processor.py +++ b/tests/test_license_processor.py @@ -1,4 +1,4 @@ -from doi2dataset import License, LicenseProcessor +from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, License, LicenseProcessor def test_license_processor_cc_by(): @@ -50,3 +50,55 @@ def test_license_processor_no_primary_location(): assert license_obj.short == "unknown" assert license_obj.name == "" assert license_obj.uri == "" + + +def test_derivative_allowed_licenses_cc_by(): + """Test that CC BY license allows derivatives""" + assert "cc-by" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_cc_by_sa(): + """Test that CC BY-SA license allows derivatives""" + assert "cc-by-sa" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_cc_by_nc(): + """Test that CC BY-NC license allows derivatives""" + assert "cc-by-nc" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_cc_by_nc_sa(): + """Test that CC BY-NC-SA license allows derivatives""" + assert "cc-by-nc-sa" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_cc0(): + """Test that CC0 license allows derivatives""" + assert "cc0" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_public_domain(): + """Test that Public Domain license allows derivatives""" + assert "pd" in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_not_allowed_licenses_cc_by_nd(): + """Test that CC BY-ND license does not allow derivatives""" + assert "cc-by-nd" not in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_not_allowed_licenses_cc_by_nc_nd(): + """Test that CC BY-NC-ND license does not allow derivatives""" + assert "cc-by-nc-nd" not in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_not_allowed_licenses_unknown(): + """Test that unknown licenses do not allow derivatives""" + assert "unknown-license" not in DERIVATIVE_ALLOWED_LICENSES + assert "all-rights-reserved" not in DERIVATIVE_ALLOWED_LICENSES + + +def test_derivative_allowed_licenses_set_completeness(): + """Test that DERIVATIVE_ALLOWED_LICENSES contains expected licenses""" + expected_licenses = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"} + assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses