refactor: centralize constants and improve tests

- Centralize LICENSE_MAP, API_URLS, DERIVATIVE_ALLOWED_LICENSES,
  and TEMPLATES in core/constants.py
- Replace custom HTTP_STATUS dict with Python's standard
  http.HTTPStatus enum
- Add comprehensive tests for derivative license logic in
  AbstractProcessor
- Add unit tests for DERIVATIVE_ALLOWED_LICENSES constant validation
- Create helper function create_license_from_map() for consistent
  test data
- Fix formatting inconsistencies in constants.py (remove double
  empty lines)
- Use descriptive _mock_print variable names to satisfy ruff linting
- Update CHANGELOG to reflect constants centralization and
  HTTPStatus usage

This eliminates code duplication, improves maintainability, and
ensures derivative license detection logic is properly tested.
This commit is contained in:
Alexander Minges 2025-07-25 10:42:00 +02:00
parent df007b6076
commit d660ed457e
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
7 changed files with 335 additions and 35 deletions

View file

@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Centralize ICONS definitions in core/constants.py module to eliminate code duplication
- Centralize additional constants (LICENSE_MAP, API_URLS, etc.) in core/constants.py module
- Remove hardcoded Creative Commons license mappings from processors.py
- Remove hardcoded API URLs for OpenAlex and CrossRef from processing modules
- Replace custom HTTP status code constants with Python's standard `http.HTTPStatus` enum
- Remove hardcoded template strings throughout codebase
### Fixed

View file

@ -51,7 +51,13 @@ from .core import (
Person,
PrimitiveMetadataField,
)
from .core.constants import ICONS
from .core.constants import (
API_URLS,
DERIVATIVE_ALLOWED_LICENSES,
ICONS,
LICENSE_MAP,
TEMPLATES,
)
from .processing import (
CitationBuilder,
MetadataProcessor,
@ -83,6 +89,10 @@ __all__ = [
"Abstract",
# Constants
"ICONS",
"LICENSE_MAP",
"API_URLS",
"DERIVATIVE_ALLOWED_LICENSES",
"TEMPLATES",
# Metadata fields
"BaseMetadataField",
"PrimitiveMetadataField",

View file

@ -6,11 +6,17 @@ including license processing and abstract extraction/cleaning.
"""
import re
from http import HTTPStatus
from typing import Any
from rich.console import Console
from ..core.constants import ICONS
from ..core.constants import (
API_URLS,
DERIVATIVE_ALLOWED_LICENSES,
ICONS,
LICENSE_MAP,
)
from ..core.models import Abstract, License
from .client import APIClient
@ -20,26 +26,6 @@ class LicenseProcessor:
Processes license information from metadata.
"""
LICENSE_MAP = {
"cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"),
"cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"),
"cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"),
"cc-by-nc-sa": (
"https://creativecommons.org/licenses/by-nc-sa/4.0/",
"CC BY-NC-SA 4.0",
),
"cc-by-nc-nd": (
"https://creativecommons.org/licenses/by-nc-nd/4.0/",
"CC BY-NC-ND 4.0",
),
"cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"),
"cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
"pd": (
"https://creativecommons.org/publicdomain/mark/1.0/",
"Public Domain Mark 1.0",
),
}
@classmethod
def process_license(cls, data: dict[str, Any]) -> License:
"""
@ -58,7 +44,7 @@ class LicenseProcessor:
return License(name="", uri="", short="unknown")
base_license = license_short.split("/")[0].lower()
uri, name = cls.LICENSE_MAP.get(base_license, ("", license_short))
uri, name = LICENSE_MAP.get(base_license, ("", license_short))
return License(name=name, uri=uri, short=license_short)
@ -92,9 +78,7 @@ class AbstractProcessor:
Returns:
Abstract: The abstract with its source.
"""
license_ok = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
if license.short in license_ok:
if license.short in DERIVATIVE_ALLOWED_LICENSES:
self.console.print(
f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.",
style="info",
@ -144,10 +128,10 @@ class AbstractProcessor:
Returns:
str | None: The abstract if found, otherwise None.
"""
url = f"https://api.crossref.org/works/{doi}"
url = f"{API_URLS['crossref_base']}{doi}"
response = self.api_client.make_request(url)
if response and response.status_code == 200:
if response and response.status_code == HTTPStatus.OK:
abstract_raw = response.json().get("message", {}).get("abstract")
return self._clean_jats(abstract_raw)
return None

View file

@ -41,3 +41,45 @@ EMOJI_ICONS = {
# Default icon set preference
DEFAULT_ICONS = ICONS
# API endpoint URLs
API_URLS = {
"openalex_base": "https://api.openalex.org/works/https://doi.org/",
"crossref_base": "https://api.crossref.org/works/",
}
# License mapping for Creative Commons and public domain licenses
LICENSE_MAP = {
"cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"),
"cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"),
"cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"),
"cc-by-nc-sa": (
"https://creativecommons.org/licenses/by-nc-sa/4.0/",
"CC BY-NC-SA 4.0",
),
"cc-by-nc-nd": (
"https://creativecommons.org/licenses/by-nc-nd/4.0/",
"CC BY-NC-ND 4.0",
),
"cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"),
"cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
"pd": (
"https://creativecommons.org/publicdomain/mark/1.0/",
"Public Domain Mark 1.0",
),
}
# Licenses that allow derivative works (for abstract extraction)
DERIVATIVE_ALLOWED_LICENSES = {
"cc-by",
"cc-by-sa",
"cc-by-nc",
"cc-by-nc-sa",
"cc0",
"pd",
}
# Template strings
TEMPLATES = {
"copyright_todo": "All rights reserved. Copyright © {year}, [TODO: Insert copyright holder here!]",
}

View file

@ -7,6 +7,7 @@ of processing DOIs: fetching data, building metadata, and optionally uploading t
import json
import warnings
from http import HTTPStatus
from pathlib import Path
from typing import Any
@ -16,7 +17,7 @@ from rich.progress import Progress, TaskID
from ..api.client import APIClient
from ..api.processors import AbstractProcessor, LicenseProcessor
from ..core.config import Config
from ..core.constants import ICONS
from ..core.constants import API_URLS, ICONS, TEMPLATES
from ..core.metadata_fields import (
CompoundMetadataField,
ControlledVocabularyMetadataField,
@ -190,10 +191,10 @@ class MetadataProcessor:
Raises:
ValueError: If data fetching fails.
"""
url = f"https://api.openalex.org/works/https://doi.org/{self.doi}"
url = f"{API_URLS['openalex_base']}{self.doi}"
response = self.api_client.make_request(url)
if response is None or response.status_code != 200:
if response is None or response.status_code != HTTPStatus.OK:
self.console.print(
f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}",
style="error",
@ -319,9 +320,9 @@ class MetadataProcessor:
"uri": license_info.uri,
}
else:
return_dict["datasetVersion"]["termsOfUse"] = (
f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]"
)
return_dict["datasetVersion"]["termsOfUse"] = TEMPLATES[
"copyright_todo"
].format(year=self._get_publication_year(data))
return return_dict

View file

@ -0,0 +1,206 @@
from unittest.mock import patch
import pytest
from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, LICENSE_MAP, License
from doi2dataset.api.client import APIClient
from doi2dataset.api.processors import AbstractProcessor
def create_license_from_map(license_short: str) -> License:
"""Helper function to create License objects from LICENSE_MAP"""
if license_short in LICENSE_MAP:
uri, name = LICENSE_MAP[license_short]
return License(name=name, uri=uri, short=license_short)
else:
# For unknown licenses not in the map
return License(name="Unknown License", uri="", short=license_short)
class TestAbstractProcessor:
"""Test cases for AbstractProcessor derivative license logic"""
def setup_method(self):
"""Setup test fixtures"""
self.api_client = APIClient()
self.processor = AbstractProcessor(self.api_client)
def test_derivative_allowed_license_uses_crossref(self):
"""Test that licenses allowing derivatives attempt CrossRef first"""
# Create a license that allows derivatives using LICENSE_MAP
license_obj = create_license_from_map("cc-by")
# Mock the CrossRef method to return an abstract and console output
with patch.object(
self.processor,
"_get_crossref_abstract",
return_value="CrossRef abstract text",
) as mock_crossref:
with patch.object(
self.processor, "_get_openalex_abstract"
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should call CrossRef and get result
mock_crossref.assert_called_once_with("10.1234/test")
mock_openalex.assert_not_called()
assert result.text == "CrossRef abstract text"
assert result.source == "crossref"
def test_derivative_not_allowed_license_uses_openalex(self):
"""Test that licenses not allowing derivatives use OpenAlex reconstruction"""
# Create a license that does not allow derivatives using LICENSE_MAP
license_obj = create_license_from_map("cc-by-nd")
# Mock the OpenAlex method to return an abstract
with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref:
with patch.object(
self.processor,
"_get_openalex_abstract",
return_value="OpenAlex reconstructed text",
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should skip CrossRef and use OpenAlex
mock_crossref.assert_not_called()
mock_openalex.assert_called_once_with({})
assert result.text == "OpenAlex reconstructed text"
assert result.source == "openalex"
def test_unknown_license_uses_openalex(self):
"""Test that unknown licenses default to OpenAlex reconstruction"""
# Create an unknown license (not in LICENSE_MAP)
license_obj = create_license_from_map("unknown-license")
# Mock the OpenAlex method to return an abstract
with patch.object(self.processor, "_get_crossref_abstract") as mock_crossref:
with patch.object(
self.processor,
"_get_openalex_abstract",
return_value="OpenAlex reconstructed text",
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should skip CrossRef and use OpenAlex
mock_crossref.assert_not_called()
mock_openalex.assert_called_once_with({})
assert result.text == "OpenAlex reconstructed text"
assert result.source == "openalex"
def test_crossref_fallback_to_openalex(self):
"""Test fallback to OpenAlex when CrossRef returns no abstract"""
# Create a license that allows derivatives using LICENSE_MAP
license_obj = create_license_from_map("cc-by")
# Mock CrossRef to return None (no abstract found)
with patch.object(
self.processor, "_get_crossref_abstract", return_value=None
) as mock_crossref:
with patch.object(
self.processor,
"_get_openalex_abstract",
return_value="OpenAlex fallback text",
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should try CrossRef first, then fall back to OpenAlex
mock_crossref.assert_called_once_with("10.1234/test")
mock_openalex.assert_called_once_with({})
assert result.text == "OpenAlex fallback text"
assert result.source == "openalex"
def test_no_abstract_found_anywhere(self):
"""Test when no abstract is found in either source"""
# Create a license that allows derivatives using LICENSE_MAP
license_obj = create_license_from_map("cc-by")
# Mock both methods to return None
with patch.object(
self.processor, "_get_crossref_abstract", return_value=None
) as mock_crossref:
with patch.object(
self.processor, "_get_openalex_abstract", return_value=None
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should try both sources
mock_crossref.assert_called_once_with("10.1234/test")
mock_openalex.assert_called_once_with({})
assert result.text == ""
assert result.source == "none"
@pytest.mark.parametrize("license_short", DERIVATIVE_ALLOWED_LICENSES)
def test_all_derivative_allowed_licenses_use_crossref_first(self, license_short):
"""Test that all licenses in DERIVATIVE_ALLOWED_LICENSES use CrossRef first"""
# Create license using LICENSE_MAP data
license_obj = create_license_from_map(license_short)
with patch.object(
self.processor, "_get_crossref_abstract", return_value="CrossRef text"
) as mock_crossref:
with patch.object(
self.processor, "_get_openalex_abstract"
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
result = self.processor.get_abstract(
"10.1234/test", {}, license_obj
)
# Should use CrossRef for all derivative-allowed licenses
mock_crossref.assert_called_once()
mock_openalex.assert_not_called()
assert result.source == "crossref"
def test_derivative_allowed_licenses_set_matches_usage(self):
"""Test that DERIVATIVE_ALLOWED_LICENSES set is correctly used in logic"""
# This is a meta-test to ensure the constant is used correctly
# Test a license that should allow derivatives using LICENSE_MAP
allowed_license = create_license_from_map("cc-by")
assert allowed_license.short in DERIVATIVE_ALLOWED_LICENSES
# Test a license that should not allow derivatives using LICENSE_MAP
not_allowed_license = create_license_from_map("cc-by-nd")
assert not_allowed_license.short not in DERIVATIVE_ALLOWED_LICENSES
# Test that the processor logic matches the set
with patch.object(
self.processor, "_get_crossref_abstract", return_value="CrossRef"
) as mock_crossref:
with patch.object(
self.processor, "_get_openalex_abstract", return_value="OpenAlex"
) as mock_openalex:
with patch.object(self.processor.console, "print") as _mock_print:
# Allowed license should use CrossRef
result1 = self.processor.get_abstract(
"10.1234/test", {}, allowed_license
)
assert mock_crossref.call_count == 1
assert result1.source == "crossref"
# Reset mocks
mock_crossref.reset_mock()
mock_openalex.reset_mock()
# Not allowed license should skip CrossRef
result2 = self.processor.get_abstract(
"10.1234/test", {}, not_allowed_license
)
mock_crossref.assert_not_called()
mock_openalex.assert_called_once()
assert result2.source == "openalex"

View file

@ -1,4 +1,4 @@
from doi2dataset import License, LicenseProcessor
from doi2dataset import DERIVATIVE_ALLOWED_LICENSES, License, LicenseProcessor
def test_license_processor_cc_by():
@ -50,3 +50,55 @@ def test_license_processor_no_primary_location():
assert license_obj.short == "unknown"
assert license_obj.name == ""
assert license_obj.uri == ""
def test_derivative_allowed_licenses_cc_by():
"""Test that CC BY license allows derivatives"""
assert "cc-by" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_sa():
"""Test that CC BY-SA license allows derivatives"""
assert "cc-by-sa" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_nc():
"""Test that CC BY-NC license allows derivatives"""
assert "cc-by-nc" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc_by_nc_sa():
"""Test that CC BY-NC-SA license allows derivatives"""
assert "cc-by-nc-sa" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_cc0():
"""Test that CC0 license allows derivatives"""
assert "cc0" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_public_domain():
"""Test that Public Domain license allows derivatives"""
assert "pd" in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_cc_by_nd():
"""Test that CC BY-ND license does not allow derivatives"""
assert "cc-by-nd" not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_cc_by_nc_nd():
"""Test that CC BY-NC-ND license does not allow derivatives"""
assert "cc-by-nc-nd" not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_not_allowed_licenses_unknown():
"""Test that unknown licenses do not allow derivatives"""
assert "unknown-license" not in DERIVATIVE_ALLOWED_LICENSES
assert "all-rights-reserved" not in DERIVATIVE_ALLOWED_LICENSES
def test_derivative_allowed_licenses_set_completeness():
"""Test that DERIVATIVE_ALLOWED_LICENSES contains expected licenses"""
expected_licenses = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses