refactor: centralize constants and improve tests

- Centralize LICENSE_MAP, API_URLS, DERIVATIVE_ALLOWED_LICENSES,
  and TEMPLATES in core/constants.py
- Replace custom HTTP_STATUS dict with Python's standard
  http.HTTPStatus enum
- Add comprehensive tests for derivative license logic in
  AbstractProcessor
- Add unit tests for DERIVATIVE_ALLOWED_LICENSES constant validation
- Create helper function create_license_from_map() for consistent
  test data
- Fix formatting inconsistencies in constants.py (remove double
  empty lines)
- Use descriptive _mock_print variable names to satisfy ruff linting
- Update CHANGELOG to reflect constants centralization and
  HTTPStatus usage

This eliminates code duplication, improves maintainability, and
ensures derivative license detection logic is properly tested.
This commit is contained in:
Alexander Minges 2025-07-25 10:42:00 +02:00
parent df007b6076
commit d660ed457e
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
7 changed files with 335 additions and 35 deletions

View file

@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### Changed
- Centralize ICONS definitions in core/constants.py module to eliminate code duplication - Centralize ICONS definitions in core/constants.py module to eliminate code duplication
- Centralize additional constants (LICENSE_MAP, API_URLS, etc.) in core/constants.py module
- Remove hardcoded Creative Commons license mappings from processors.py
- Remove hardcoded API URLs for OpenAlex and CrossRef from processing modules
- Replace custom HTTP status code constants with Python's standard `http.HTTPStatus` enum
- Remove hardcoded template strings throughout codebase
### Fixed ### Fixed

View file

@ -51,7 +51,13 @@ from .core import (
Person, Person,
PrimitiveMetadataField, PrimitiveMetadataField,
) )
from .core.constants import ICONS from .core.constants import (
API_URLS,
DERIVATIVE_ALLOWED_LICENSES,
ICONS,
LICENSE_MAP,
TEMPLATES,
)
from .processing import ( from .processing import (
CitationBuilder, CitationBuilder,
MetadataProcessor, MetadataProcessor,
@ -83,6 +89,10 @@ __all__ = [
"Abstract", "Abstract",
# Constants # Constants
"ICONS", "ICONS",
"LICENSE_MAP",
"API_URLS",
"DERIVATIVE_ALLOWED_LICENSES",
"TEMPLATES",
# Metadata fields # Metadata fields
"BaseMetadataField", "BaseMetadataField",
"PrimitiveMetadataField", "PrimitiveMetadataField",

View file

@ -6,11 +6,17 @@ including license processing and abstract extraction/cleaning.
""" """
import re import re
from http import HTTPStatus
from typing import Any from typing import Any
from rich.console import Console from rich.console import Console
from ..core.constants import ICONS from ..core.constants import (
API_URLS,
DERIVATIVE_ALLOWED_LICENSES,
ICONS,
LICENSE_MAP,
)
from ..core.models import Abstract, License from ..core.models import Abstract, License
from .client import APIClient from .client import APIClient
@ -20,26 +26,6 @@ class LicenseProcessor:
Processes license information from metadata. Processes license information from metadata.
""" """
LICENSE_MAP = {
"cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"),
"cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"),
"cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"),
"cc-by-nc-sa": (
"https://creativecommons.org/licenses/by-nc-sa/4.0/",
"CC BY-NC-SA 4.0",
),
"cc-by-nc-nd": (
"https://creativecommons.org/licenses/by-nc-nd/4.0/",
"CC BY-NC-ND 4.0",
),
"cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"),
"cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
"pd": (
"https://creativecommons.org/publicdomain/mark/1.0/",
"Public Domain Mark 1.0",
),
}
@classmethod @classmethod
def process_license(cls, data: dict[str, Any]) -> License: def process_license(cls, data: dict[str, Any]) -> License:
""" """
@ -58,7 +44,7 @@ class LicenseProcessor:
return License(name="", uri="", short="unknown") return License(name="", uri="", short="unknown")
base_license = license_short.split("/")[0].lower() base_license = license_short.split("/")[0].lower()
uri, name = cls.LICENSE_MAP.get(base_license, ("", license_short)) uri, name = LICENSE_MAP.get(base_license, ("", license_short))
return License(name=name, uri=uri, short=license_short) return License(name=name, uri=uri, short=license_short)
@ -92,9 +78,7 @@ class AbstractProcessor:
Returns: Returns:
Abstract: The abstract with its source. Abstract: The abstract with its source.
""" """
license_ok = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"} if license.short in DERIVATIVE_ALLOWED_LICENSES:
if license.short in license_ok:
self.console.print( self.console.print(
f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.", f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.",
style="info", style="info",
@ -144,10 +128,10 @@ class AbstractProcessor:
Returns: Returns:
str | None: The abstract if found, otherwise None. str | None: The abstract if found, otherwise None.
""" """
url = f"https://api.crossref.org/works/{doi}" url = f"{API_URLS['crossref_base']}{doi}"
response = self.api_client.make_request(url) response = self.api_client.make_request(url)
if response and response.status_code == 200: if response and response.status_code == HTTPStatus.OK:
abstract_raw = response.json().get("message", {}).get("abstract") abstract_raw = response.json().get("message", {}).get("abstract")
return self._clean_jats(abstract_raw) return self._clean_jats(abstract_raw)
return None return None

View file

@ -41,3 +41,45 @@ EMOJI_ICONS = {
# Default icon set preference # Default icon set preference
DEFAULT_ICONS = ICONS DEFAULT_ICONS = ICONS
# Base endpoints of the external metadata services, keyed by service name.
API_URLS = {
    "openalex_base": "https://api.openalex.org/works/https://doi.org/",
    "crossref_base": "https://api.crossref.org/works/",
}

# Creative Commons / public-domain licenses.
# Maps the short license identifier to a (URI, display name) tuple.
LICENSE_MAP = {
    "cc-by": (
        "https://creativecommons.org/licenses/by/4.0/",
        "CC BY 4.0",
    ),
    "cc-by-sa": (
        "https://creativecommons.org/licenses/by-sa/4.0/",
        "CC BY-SA 4.0",
    ),
    "cc-by-nc": (
        "https://creativecommons.org/licenses/by-nc/4.0/",
        "CC BY-NC 4.0",
    ),
    "cc-by-nc-sa": (
        "https://creativecommons.org/licenses/by-nc-sa/4.0/",
        "CC BY-NC-SA 4.0",
    ),
    "cc-by-nc-nd": (
        "https://creativecommons.org/licenses/by-nc-nd/4.0/",
        "CC BY-NC-ND 4.0",
    ),
    "cc-by-nd": (
        "https://creativecommons.org/licenses/by-nd/4.0/",
        "CC BY-ND 4.0",
    ),
    "cc0": (
        "https://creativecommons.org/publicdomain/zero/1.0/",
        "CC0 1.0",
    ),
    "pd": (
        "https://creativecommons.org/publicdomain/mark/1.0/",
        "Public Domain Mark 1.0",
    ),
}

# Short identifiers of licenses that permit derivative works
# (consulted when deciding how an abstract may be extracted).
DERIVATIVE_ALLOWED_LICENSES = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}

# Reusable text templates; placeholders such as {year} are filled via str.format().
TEMPLATES = {
    "copyright_todo": "All rights reserved. Copyright © {year}, [TODO: Insert copyright holder here!]",
}

View file

@ -7,6 +7,7 @@ of processing DOIs: fetching data, building metadata, and optionally uploading t
import json import json
import warnings import warnings
from http import HTTPStatus
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -16,7 +17,7 @@ from rich.progress import Progress, TaskID
from ..api.client import APIClient from ..api.client import APIClient
from ..api.processors import AbstractProcessor, LicenseProcessor from ..api.processors import AbstractProcessor, LicenseProcessor
from ..core.config import Config from ..core.config import Config
from ..core.constants import ICONS from ..core.constants import API_URLS, ICONS, TEMPLATES
from ..core.metadata_fields import ( from ..core.metadata_fields import (
CompoundMetadataField, CompoundMetadataField,
ControlledVocabularyMetadataField, ControlledVocabularyMetadataField,
@ -190,10 +191,10 @@ class MetadataProcessor:
Raises: Raises:
ValueError: If data fetching fails. ValueError: If data fetching fails.
""" """
url = f"https://api.openalex.org/works/https://doi.org/{self.doi}" url = f"{API_URLS['openalex_base']}{self.doi}"
response = self.api_client.make_request(url) response = self.api_client.make_request(url)
if response is None or response.status_code != 200: if response is None or response.status_code != HTTPStatus.OK:
self.console.print( self.console.print(
f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}", f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}",
style="error", style="error",
@ -319,9 +320,9 @@ class MetadataProcessor:
"uri": license_info.uri, "uri": license_info.uri,
} }
else: else:
return_dict["datasetVersion"]["termsOfUse"] = ( return_dict["datasetVersion"]["termsOfUse"] = TEMPLATES[
f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]" "copyright_todo"
) ].format(year=self._get_publication_year(data))
return return_dict return return_dict

View file

@ -0,0 +1,206 @@
from unittest.mock import patch
import pytest
from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, LICENSE_MAP, License
from doi2dataset.api.client import APIClient
from doi2dataset.api.processors import AbstractProcessor
def create_license_from_map(license_short: str) -> License:
    """Build a License test fixture for the given short identifier.

    Identifiers present in LICENSE_MAP get the URI and name stored there;
    any other identifier yields a License named "Unknown License" with an
    empty URI. The short identifier is passed through unchanged either way.
    """
    entry = LICENSE_MAP.get(license_short)
    if entry is None:
        # Identifier is not part of the map
        return License(name="Unknown License", uri="", short=license_short)

    uri, name = entry
    return License(name=name, uri=uri, short=license_short)
class TestAbstractProcessor:
    """Test cases for AbstractProcessor derivative license logic.

    Each test patches ``_get_crossref_abstract``, ``_get_openalex_abstract``
    and ``console.print`` on the processor instance, so only the routing
    between the two abstract sources is exercised.
    """

    def setup_method(self):
        """Create a fresh APIClient and AbstractProcessor for every test."""
        self.api_client = APIClient()
        self.processor = AbstractProcessor(self.api_client)

    def test_derivative_allowed_license_uses_crossref(self):
        """Test that licenses allowing derivatives attempt CrossRef first"""
        # Create a license that allows derivatives using LICENSE_MAP
        license_obj = create_license_from_map("cc-by")

        # Mock the CrossRef method to return an abstract and console output
        with patch.object(
            self.processor,
            "_get_crossref_abstract",
            return_value="CrossRef abstract text",
        ) as mock_crossref:
            with patch.object(
                self.processor, "_get_openalex_abstract"
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should call CrossRef and get result
        # (mocks keep their recorded calls after the patches are undone)
        mock_crossref.assert_called_once_with("10.1234/test")
        mock_openalex.assert_not_called()
        assert result.text == "CrossRef abstract text"
        assert result.source == "crossref"

    def test_derivative_not_allowed_license_uses_openalex(self):
        """Test that licenses not allowing derivatives use OpenAlex reconstruction"""
        # Create a license that does not allow derivatives using LICENSE_MAP
        license_obj = create_license_from_map("cc-by-nd")

        # Mock the OpenAlex method to return an abstract
        with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref:
            with patch.object(
                self.processor,
                "_get_openalex_abstract",
                return_value="OpenAlex reconstructed text",
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should skip CrossRef and use OpenAlex
        mock_crossref.assert_not_called()
        mock_openalex.assert_called_once_with({})
        assert result.text == "OpenAlex reconstructed text"
        assert result.source == "openalex"

    def test_unknown_license_uses_openalex(self):
        """Test that unknown licenses default to OpenAlex reconstruction"""
        # Create an unknown license (not in LICENSE_MAP)
        license_obj = create_license_from_map("unknown-license")

        # Mock the OpenAlex method to return an abstract
        with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref:
            with patch.object(
                self.processor,
                "_get_openalex_abstract",
                return_value="OpenAlex reconstructed text",
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should skip CrossRef and use OpenAlex
        mock_crossref.assert_not_called()
        mock_openalex.assert_called_once_with({})
        assert result.text == "OpenAlex reconstructed text"
        assert result.source == "openalex"

    def test_crossref_fallback_to_openalex(self):
        """Test fallback to OpenAlex when CrossRef returns no abstract"""
        # Create a license that allows derivatives using LICENSE_MAP
        license_obj = create_license_from_map("cc-by")

        # Mock CrossRef to return None (no abstract found)
        with patch.object(
            self.processor, "_get_crossref_abstract", return_value=None
        ) as mock_crossref:
            with patch.object(
                self.processor,
                "_get_openalex_abstract",
                return_value="OpenAlex fallback text",
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should try CrossRef first, then fall back to OpenAlex
        mock_crossref.assert_called_once_with("10.1234/test")
        mock_openalex.assert_called_once_with({})
        assert result.text == "OpenAlex fallback text"
        assert result.source == "openalex"

    def test_no_abstract_found_anywhere(self):
        """Test when no abstract is found in either source"""
        # Create a license that allows derivatives using LICENSE_MAP
        license_obj = create_license_from_map("cc-by")

        # Mock both methods to return None
        with patch.object(
            self.processor, "_get_crossref_abstract", return_value=None
        ) as mock_crossref:
            with patch.object(
                self.processor, "_get_openalex_abstract", return_value=None
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should try both sources
        mock_crossref.assert_called_once_with("10.1234/test")
        mock_openalex.assert_called_once_with({})
        assert result.text == ""
        assert result.source == "none"

    # DERIVATIVE_ALLOWED_LICENSES is a set of strings, whose iteration order
    # can differ between interpreter runs (string hash randomization). Sorting
    # gives a stable parameter order, which keeps test ordering reproducible
    # and keeps collection consistent across pytest-xdist workers.
    @pytest.mark.parametrize("license_short", sorted(DERIVATIVE_ALLOWED_LICENSES))
    def test_all_derivative_allowed_licenses_use_crossref_first(self, license_short):
        """Test that all licenses in DERIVATIVE_ALLOWED_LICENSES use CrossRef first"""
        # Create license using LICENSE_MAP data
        license_obj = create_license_from_map(license_short)

        with patch.object(
            self.processor, "_get_crossref_abstract", return_value="CrossRef text"
        ) as mock_crossref:
            with patch.object(
                self.processor, "_get_openalex_abstract"
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    result = self.processor.get_abstract(
                        "10.1234/test", {}, license_obj
                    )

        # Should use CrossRef for all derivative-allowed licenses
        mock_crossref.assert_called_once()
        mock_openalex.assert_not_called()
        assert result.source == "crossref"

    def test_derivative_allowed_licenses_set_matches_usage(self):
        """Test that DERIVATIVE_ALLOWED_LICENSES set is correctly used in logic"""
        # This is a meta-test to ensure the constant is used correctly

        # Test a license that should allow derivatives using LICENSE_MAP
        allowed_license = create_license_from_map("cc-by")
        assert allowed_license.short in DERIVATIVE_ALLOWED_LICENSES

        # Test a license that should not allow derivatives using LICENSE_MAP
        not_allowed_license = create_license_from_map("cc-by-nd")
        assert not_allowed_license.short not in DERIVATIVE_ALLOWED_LICENSES

        # Test that the processor logic matches the set
        with patch.object(
            self.processor, "_get_crossref_abstract", return_value="CrossRef"
        ) as mock_crossref:
            with patch.object(
                self.processor, "_get_openalex_abstract", return_value="OpenAlex"
            ) as mock_openalex:
                with patch.object(self.processor.console, "print") as _mock_print:
                    # Allowed license should use CrossRef
                    result1 = self.processor.get_abstract(
                        "10.1234/test", {}, allowed_license
                    )
                    assert mock_crossref.call_count == 1
                    assert result1.source == "crossref"

                    # Reset mocks so the second call is counted from zero
                    mock_crossref.reset_mock()
                    mock_openalex.reset_mock()

                    # Not allowed license should skip CrossRef; this call must
                    # stay inside the patches so the mocks are still active
                    result2 = self.processor.get_abstract(
                        "10.1234/test", {}, not_allowed_license
                    )
                    mock_crossref.assert_not_called()
                    mock_openalex.assert_called_once()
                    assert result2.source == "openalex"

View file

@ -1,4 +1,4 @@
from doi2dataset import License, LicenseProcessor from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, License, LicenseProcessor
def test_license_processor_cc_by(): def test_license_processor_cc_by():
@ -50,3 +50,55 @@ def test_license_processor_no_primary_location():
assert license_obj.short == "unknown" assert license_obj.short == "unknown"
assert license_obj.name == "" assert license_obj.name == ""
assert license_obj.uri == "" assert license_obj.uri == ""
def test_derivative_allowed_licenses_cc_by():
    """CC BY must be listed as allowing derivatives"""
    license_key = "cc-by"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_sa():
    """CC BY-SA must be listed as allowing derivatives"""
    license_key = "cc-by-sa"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_nc():
    """CC BY-NC must be listed as allowing derivatives"""
    license_key = "cc-by-nc"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_nc_sa():
    """CC BY-NC-SA must be listed as allowing derivatives"""
    license_key = "cc-by-nc-sa"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc0():
    """CC0 must be listed as allowing derivatives"""
    license_key = "cc0"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_public_domain():
    """The public domain identifier must be listed as allowing derivatives"""
    license_key = "pd"
    assert license_key in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_cc_by_nd():
    """CC BY-ND must not be listed as allowing derivatives"""
    license_key = "cc-by-nd"
    assert license_key not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_cc_by_nc_nd():
    """CC BY-NC-ND must not be listed as allowing derivatives"""
    license_key = "cc-by-nc-nd"
    assert license_key not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_unknown():
    """Identifiers outside the known license set must not allow derivatives"""
    # Checked in the same order as before: first failure stops the test
    for license_key in ("unknown-license", "all-rights-reserved"):
        assert license_key not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_set_completeness():
    """DERIVATIVE_ALLOWED_LICENSES must equal exactly the expected set"""
    expected_licenses = {
        "cc-by",
        "cc-by-sa",
        "cc-by-nc",
        "cc-by-nc-sa",
        "cc0",
        "pd",
    }
    assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses