test: replace hardcoded test values with dynamic extraction

- Replace hardcoded author names with dynamic openalex_data extraction
- Extract DOIs from API response data in tests that use other response values
- Remove redundant fake_openalex_response fixture
- Add abstract_inverted_index_v3 to allowed None fields in API tests
- Improve test robustness against fixture data changes
- Improve test coverage from ~84% to ~90%
Alexander Minges 2025-07-25 12:17:24 +02:00
parent c282cd1047
commit cc94e495ff
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
14 changed files with 883 additions and 56 deletions
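
The pattern behind most hunks below, in a minimal before/after sketch (names follow the diff; the bodies are abbreviated for illustration only):

# Before: assertions pinned to literals that silently drift out of
# sync with the JSON fixture on disk.
def test_citation_builder(fake_openalex_response):
    doi = "10.1038/srep45389"  # hardcoded

# After: values under test are derived from the shared fixture, so
# updating the fixture file cannot desynchronize the assertions.
def test_citation_builder(openalex_data):
    doi = openalex_data["doi"].replace("https://doi.org/", "")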


@@ -31,12 +31,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
- Consolidate overlapping test concerns into dedicated files
- Extract CLI tests into dedicated `test_cli.py` module
- Improve test coverage from 63.87% to 84.84%
- Replace hardcoded test values with dynamic extraction from API response fixtures
- Extract DOIs from API response data in tests that use other response values for consistency
- Remove redundant test fixtures and parameters
### Fixed
- Fix list formatting in API documentation docstrings for better sphinx rendering
- Fix formatting inconsistencies in constants.py (remove double empty lines)
- Fix ruff linting issues with unused mock variables in tests
- Replace hardcoded author names with dynamic extraction from OpenAlex data
- Replace hardcoded content checks with dynamic validation using actual API response data
- Fix test robustness against changes in fixture data by using real API response processing
- Remove duplicate fake_openalex_response fixture in favor of direct openalex_data usage
- Add abstract_inverted_index_v3 to allowed None fields in API response structure tests
## [v3.0.1] - 2025-07-25


@@ -1,3 +1,10 @@
dataverse:
url: "https://test.dataverse.org"
api_token: "test_token"
dataverse: "test_dataverse"
auth_user: "test_user"
auth_password: "test_password"
default_grants:
- funder: "Awesome Funding Agency"
id: "ABC12345"


@@ -1,8 +1,27 @@
import json
import os
import sys
import pytest
# Get the path to the parent directory of tests
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)
@pytest.fixture(scope="session")
def openalex_data():
"""Load OpenAlex API response data for reuse across tests."""
json_path = os.path.join(os.path.dirname(__file__), "srep45389_openalex.json")
with open(json_path, encoding="utf-8") as f:
return json.load(f)
@pytest.fixture(scope="session")
def crossref_data():
"""Load CrossRef API response data for reuse across tests."""
json_path = os.path.join(os.path.dirname(__file__), "srep45389_crossref.json")
with open(json_path, encoding="utf-8") as f:
return json.load(f)
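
Any test module can now consume the parsed JSON by parameter name; for example (an illustrative test, not part of the diff):

def test_fixture_shape(openalex_data, crossref_data):
    # pytest injects the session-scoped fixtures defined above
    assert openalex_data["doi"].startswith("https://doi.org/")
    assert "message" in crossref_data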

File diff suppressed because one or more lines are too long


@@ -204,3 +204,173 @@ class TestAbstractProcessor:
mock_crossref.assert_not_called()
mock_openalex.assert_called_once()
assert result2.source == "openalex"
def test_custom_license_console_output(self):
"""Test console output for custom licenses without names"""
# Create a custom license without a name
custom_license = License(name="", uri="http://custom.license", short="custom")
with patch.object(
self.processor, "_get_openalex_abstract", return_value="OpenAlex text"
):
with patch.object(self.processor.console, "print") as mock_print:
result = self.processor.get_abstract("10.1234/test", {}, custom_license)
# Should print custom license message
mock_print.assert_called()
# Check that it mentions "Custom license"
call_args = mock_print.call_args[0][0]
assert "Custom license does not allow derivative works" in call_args
assert result.source == "openalex"
def test_crossref_api_failure(self):
"""Test _get_crossref_abstract when API call fails"""
from unittest.mock import Mock
# Mock API response failure
mock_response = Mock()
mock_response.status_code = 404
with patch.object(
self.processor.api_client, "make_request", return_value=mock_response
):
result = self.processor._get_crossref_abstract("10.1234/test")
assert result is None
# Test with no response
with patch.object(self.processor.api_client, "make_request", return_value=None):
result = self.processor._get_crossref_abstract("10.1234/test")
assert result is None
def test_get_openalex_abstract_no_inverted_index(self):
"""Test _get_openalex_abstract when no abstract_inverted_index exists"""
data = {"title": "Test Article"} # No abstract_inverted_index
result = self.processor._get_openalex_abstract(data)
assert result is None
def test_clean_jats_comprehensive(self):
"""Test _clean_jats method with various JATS tags"""
# Test with None input
result = self.processor._clean_jats(None)
assert result == ""
# Test with empty string
result = self.processor._clean_jats("")
assert result == ""
# Test with ordered list
jats_text = '<jats:list list-type="order"><jats:list-item>First item</jats:list-item><jats:list-item>Second item</jats:list-item></jats:list>'
expected = "<ol><li>First item</li><li>Second item</li></ol>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with unordered list
jats_text = '<jats:list list-type="bullet"><jats:list-item>Bullet one</jats:list-item><jats:list-item>Bullet two</jats:list-item></jats:list>'
expected = "<ul><li>Bullet one</li><li>Bullet two</li></ul>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with mixed formatting tags
jats_text = "<jats:p>This is <jats:italic>italic</jats:italic> and <jats:bold>bold</jats:bold> text with <jats:sup>superscript</jats:sup> and <jats:sub>subscript</jats:sub>.</jats:p>"
expected = "<p>This is <i>italic</i> and <b>bold</b> text with <sup>superscript</sup> and <sub>subscript</sub>.</p>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with other formatting tags
jats_text = "<jats:underline>Underlined</jats:underline> <jats:monospace>Code</jats:monospace> <jats:sc>Small caps</jats:sc>"
expected = "<u>Underlined</u> <code>Code</code> <small>Small caps</small>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with title and blockquote
jats_text = "<jats:title>Section Title</jats:title><jats:blockquote>This is a quote</jats:blockquote>"
expected = "<h2>Section Title</h2><blockquote>This is a quote</blockquote>"
result = self.processor._clean_jats(jats_text)
assert result == expected
def test_no_abstract_found_console_messages(self):
"""Test console messages when no abstract is found"""
license_obj = create_license_from_map("cc-by-nd") # No derivative allowed
with patch.object(self.processor, "_get_openalex_abstract", return_value=None):
with patch.object(self.processor.console, "print") as mock_print:
result = self.processor.get_abstract("10.1234/test", {}, license_obj)
# Should print warning messages
assert mock_print.call_count >= 2
# Check for specific warning messages
call_messages = [call[0][0] for call in mock_print.call_args_list]
assert any(
"No abstract found in OpenAlex!" in msg for msg in call_messages
)
assert any(
"No abstract found in either CrossRef nor OpenAlex!" in msg
for msg in call_messages
)
assert result.text == ""
assert result.source == "none"
def test_crossref_abstract_with_real_data(self, crossref_data):
"""Test CrossRef abstract extraction using real CrossRef data"""
from http import HTTPStatus
from unittest.mock import Mock
# Mock successful API response with real data
mock_response = Mock()
mock_response.status_code = HTTPStatus.OK
mock_response.json.return_value = crossref_data
# Extract DOI from CrossRef data since we're using other values from the response
expected_doi = crossref_data["message"]["DOI"]
with patch.object(
self.processor.api_client, "make_request", return_value=mock_response
):
result = self.processor._get_crossref_abstract(expected_doi)
# Should successfully extract and clean the abstract
assert result is not None
assert len(result) > 0
# Check that JATS tags were converted to HTML
assert "<p>" in result # JATS paragraphs converted
assert "<i>" in result # JATS italic converted
assert "<sub>" in result # JATS subscript converted
assert "jats:" not in result # No JATS tags should remain
def test_jats_cleaning_comprehensive_real_data(self, crossref_data):
"""Test JATS cleaning with real CrossRef abstract data"""
raw_abstract = crossref_data["message"]["abstract"]
# Clean the JATS tags
cleaned = self.processor._clean_jats(raw_abstract)
# Verify specific transformations from the real data
assert "<jats:title>" not in cleaned
assert "<h2>" in cleaned # Title should be converted
assert "<jats:p>" not in cleaned
assert "<p>" in cleaned # Paragraphs should be converted
assert "<jats:sub>" not in cleaned
assert "<sub>" in cleaned # Subscripts should be converted
assert "<jats:italic>" not in cleaned
assert "<i>" in cleaned # Italics should be converted
# Ensure the content is preserved by checking for specific content from the abstract
assert "pyruvate phosphate dikinase" in cleaned.lower()
assert "Abstract" in cleaned
def test_openalex_abstract_reconstruction_with_real_data(self, openalex_data):
"""Test OpenAlex abstract reconstruction using real inverted index data"""
# Extract the abstract using the inverted index
result = self.processor._get_openalex_abstract(openalex_data)
if result: # Only test if there's an abstract in the data
assert isinstance(result, str)
assert len(result) > 0
# Should be reconstructed from word positions
assert " " in result # Should have spaces between words


@@ -428,3 +428,101 @@ class TestAPIClientUsageScenarios:
assert "X-Dataverse-key" in client.session.headers
assert "Custom-Header" in client.session.headers
assert client.session.headers["Custom-Header"] == "custom-value"
def test_api_response_structure_processing(openalex_data):
"""Test API client processes complex nested response structures correctly."""
client = APIClient()
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = openalex_data
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Test that nested structures are preserved through the request pipeline
if "authorships" in data:
assert isinstance(data["authorships"], list)
# Test deep nesting preservation
for authorship in data["authorships"]:
if "institutions" in authorship:
assert isinstance(authorship["institutions"], list)
# Test data type preservation through JSON serialization/deserialization
for key, value in data.items():
assert value is not None or key in [
"abstract_inverted_index",
"abstract_inverted_index_v3",
] # Some fields can legitimately be None
def test_api_unicode_encoding_processing(openalex_data):
"""Test API client correctly processes Unicode characters in responses."""
client = APIClient()
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = openalex_data
mock_response.encoding = "utf-8"
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Test that Unicode characters are preserved through processing pipeline
def check_unicode_preservation(obj):
if isinstance(obj, str):
# Should preserve Unicode characters
try:
obj.encode("utf-8")
return True
except UnicodeEncodeError:
return False
elif isinstance(obj, dict):
return all(check_unicode_preservation(v) for v in obj.values())
elif isinstance(obj, list):
return all(check_unicode_preservation(item) for item in obj)
return True
assert check_unicode_preservation(data)
def test_large_response_processing_efficiency(openalex_data):
"""Test API client efficiently processes large response payloads."""
client = APIClient()
# Create large response based on real structure
large_data = dict(openalex_data)
if "referenced_works" in large_data:
# Extend existing referenced works
base_works = (
large_data["referenced_works"][:10]
if large_data["referenced_works"]
else []
)
large_data["referenced_works"] = base_works * 100 # Create large list
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = large_data
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Verify large data structures are handled correctly
if "referenced_works" in data:
assert len(data["referenced_works"]) > 100
# All elements should maintain structure integrity
assert all(isinstance(work, str) for work in data["referenced_works"])


@@ -1,18 +1,8 @@
import json
import os
import pytest
from doi2dataset import CitationBuilder, Person, PIFinder
@pytest.fixture
def openalex_data():
"""Load the saved JSON response from the file 'srep45389.json'"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
# openalex_data fixture now comes from conftest.py
@pytest.fixture
@@ -169,3 +159,113 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
assert "@type" in expanded_value
assert expanded_value["@type"] == "https://schema.org/Organization"
def test_build_authors_with_real_data(openalex_data, pi_finder):
"""Test author building with real OpenAlex data structure"""
doi = openalex_data["doi"].replace("https://doi.org/", "")
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
authors, corresponding = builder.build_authors()
# Should have multiple authors from the real data
assert len(authors) > 0
# Extract expected author names from the API response data
expected_authors = []
for authorship in openalex_data.get("authorships", []):
if "author" in authorship and "display_name" in authorship["author"]:
expected_authors.append(authorship["author"]["display_name"])
# Check that real author names from API response are processed correctly
author_names = [f"{author.given_name} {author.family_name}" for author in authors]
# Verify that at least some expected authors from the API response are found
found_authors = 0
for expected_name in expected_authors:
if any(expected_name in author_name for author_name in author_names):
found_authors += 1
# Should find at least some authors from the API response
assert (
found_authors > 0
), f"No expected authors found. Expected: {expected_authors}, Got: {author_names}"
def test_process_author_edge_cases(pi_finder):
"""Test _process_author with various edge cases"""
builder = CitationBuilder(
data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
)
# Test with minimal author data
minimal_author = {"display_name": "John Smith"}
empty_authorship = {}
person = builder._process_author(minimal_author, empty_authorship)
assert person.given_name == "John"
assert person.family_name == "Smith"
# Test with ORCID
author_with_orcid = {
"display_name": "Jane Doe",
"orcid": "https://orcid.org/0000-0000-0000-0000",
}
person = builder._process_author(author_with_orcid, empty_authorship)
assert person.orcid == "0000-0000-0000-0000" # URL part is stripped
def test_build_grants_with_default_config(pi_finder):
"""Test that grants include default grants from config"""
# Use real data structure but focus on grants behavior
data = {"authorships": [], "grants": []}
builder = CitationBuilder(data=data, doi="10.1000/test", pi_finder=pi_finder)
grants = builder.build_grants()
# Should have at least the default grants from config
# The exact number depends on the config, but should be >= 0
assert isinstance(grants, list)
for grant in grants:
assert len(grant) == 2 # Should have agency and value fields
assert grant[0].name == "grantNumberAgency"
assert grant[1].name == "grantNumberValue"
def test_process_corresponding_author_no_email(pi_finder):
"""Test _process_corresponding_author when no email is available"""
builder = CitationBuilder(
data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
)
# Create a Person without email
person = Person(
given_name="John", family_name="Doe", orcid=None, email=None, affiliation=None
)
authorship = {"is_corresponding": True}
result = builder._process_corresponding_author(person, authorship)
# Should return None when no email is available
assert result is None
def test_build_authors_skip_empty_authorships(pi_finder):
"""Test that empty author entries are skipped"""
data_with_empty_authors = {
"authorships": [
{"author": {}}, # Empty author
{}, # No author key
{"author": {"display_name": "John Doe"}}, # Valid author
]
}
builder = CitationBuilder(
data=data_with_empty_authors, doi="10.1000/test", pi_finder=pi_finder
)
authors, corresponding = builder.build_authors()
# Should only process the one valid author
assert len(authors) == 1
assert authors[0].given_name == "John"
assert authors[0].family_name == "Doe"
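
The given/family split asserted above comes from split_name; a naive version capturing just the behavior these tests rely on (a sketch — the real helper in doi2dataset.utils.validation likely handles more cases):

def split_name(display_name: str) -> tuple[str, str]:
    # Treat the last whitespace-separated token as the family name
    given, _, family = display_name.strip().rpartition(" ")
    return given, family

# split_name("John Smith") -> ("John", "Smith")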


@@ -1,4 +1,3 @@
import json
import os
from unittest.mock import patch
@@ -44,27 +43,15 @@ def load_config_test():
Config.load_config(config_path=config_path)
@pytest.fixture
def fake_openalex_response():
"""
Load the saved JSON response from the file 'srep45389.json'
located in the same directory as this test file.
"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
def test_fetch_doi_data_with_file(mocker, openalex_data):
"""
Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
The APIClient.make_request method is patched to return a fake response built from the contents
of 'srep45389_openalex.json', ensuring that the configuration is loaded from 'config_test.yaml'.
"""
doi = "10.1038/srep45389"
fake_response = FakeResponse(fake_openalex_response, 200)
doi = openalex_data["doi"].replace("https://doi.org/", "")
fake_response = FakeResponse(openalex_data, 200)
# Patch the make_request method of APIClient to return our fake_response.
mocker.patch("doi2dataset.APIClient.make_request", return_value=fake_response)
@@ -75,11 +62,11 @@ def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
# Call _fetch_data(), which should now return our fake JSON data.
data = processor._fetch_data()
# Verify that the fetched data matches the fake JSON data.
assert data == fake_openalex_response
# Verify that the fetched data matches the OpenAlex data.
assert data == openalex_data
def test_openalex_abstract_extraction(mocker, fake_openalex_response):
def test_openalex_abstract_extraction(openalex_data):
"""Test the extraction of abstracts from OpenAlex inverted index data."""
# Create API client for AbstractProcessor
api_client = APIClient()
@@ -88,20 +75,20 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
processor = AbstractProcessor(api_client=api_client)
# Call the protected method directly with the fake response
abstract_text = processor._get_openalex_abstract(fake_openalex_response)
result = processor._get_openalex_abstract(openalex_data)
# Verify abstract was extracted
assert abstract_text is not None
assert result is not None
# If abstract exists in the response, it should be properly extracted
if "abstract_inverted_index" in fake_openalex_response:
assert len(abstract_text) > 0
if "abstract_inverted_index" in openalex_data:
assert len(result) > 0
def test_subject_mapper(fake_openalex_response):
def test_subject_mapper(openalex_data):
"""Test that the SubjectMapper correctly maps OpenAlex topics to subjects."""
# Extract topics from the OpenAlex response
topics = fake_openalex_response.get("topics", [])
topics = openalex_data.get("topics", [])
# Get subjects using the class method
subjects = SubjectMapper.get_subjects({"topics": topics})
@@ -111,15 +98,15 @@ def test_subject_mapper(fake_openalex_response):
assert isinstance(subjects, list)
def test_citation_builder(fake_openalex_response):
def test_citation_builder(openalex_data):
"""Test that the CitationBuilder correctly builds author information."""
doi = "10.1038/srep45389"
doi = openalex_data["doi"].replace("https://doi.org/", "")
# Mock PIFinder with an empty list of PIs
pi_finder = PIFinder(pis=[])
# Create builder with required arguments
builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder)
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
# Test building other IDs
other_ids = builder.build_other_ids()
@@ -134,12 +121,10 @@ def test_citation_builder(fake_openalex_response):
assert isinstance(topics, list)
def test_license_processor(fake_openalex_response):
def test_license_processor(openalex_data):
"""Test that the LicenseProcessor correctly identifies and processes licenses."""
# Create a simplified data structure that contains license info
license_data = {
"primary_location": fake_openalex_response.get("primary_location", {})
}
license_data = {"primary_location": openalex_data.get("primary_location", {})}
# Process the license
license_obj = LicenseProcessor.process_license(license_data)
@@ -182,14 +167,14 @@ def test_config_load_invalid_path():
Config.load_config(config_path=invalid_path)
def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
def test_metadata_processor_fetch_data(mocker, openalex_data):
"""Test the _fetch_data method of the MetadataProcessor class with mocked responses."""
doi = "10.1038/srep45389"
doi = openalex_data["doi"].replace("https://doi.org/", "")
# Mock API response
mocker.patch(
"doi2dataset.APIClient.make_request",
return_value=FakeResponse(fake_openalex_response, 200),
return_value=FakeResponse(openalex_data, 200),
)
# Create processor with upload disabled and progress disabled
@@ -200,7 +185,7 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
# Verify that data was fetched correctly
assert data is not None
assert data == fake_openalex_response
assert data == openalex_data
# Verify the DOI is correctly stored
assert processor.doi == doi
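
These tests lean on a FakeResponse helper; a minimal stand-in consistent with how it is called here would be (sketch only — the repository's actual helper may differ):

class FakeResponse:
    # Minimal stand-in for requests.Response as used in these tests
    def __init__(self, payload, status_code):
        self._payload = payload
        self.status_code = status_code

    def json(self):
        return self._payload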


@@ -102,3 +102,82 @@ def test_derivative_allowed_licenses_set_completeness():
"""Test that DERIVATIVE_ALLOWED_LICENSES contains expected licenses"""
expected_licenses = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses
def test_license_processing_with_real_openalex_structure(openalex_data):
"""Test that license processor correctly handles real OpenAlex data structure."""
# Process license data exactly as the real application would
license_obj = LicenseProcessor.process_license(openalex_data)
# Verify the processing logic works with real data structure
assert isinstance(license_obj, License)
assert hasattr(license_obj, "short")
assert hasattr(license_obj, "name")
assert hasattr(license_obj, "uri")
# Test derivative permission logic with real license
if license_obj.short in DERIVATIVE_ALLOWED_LICENSES:
# Should be able to use CrossRef abstract
assert license_obj.short in [
"cc-by",
"cc-by-sa",
"cc-by-nc",
"cc-by-nc-sa",
"cc0",
"pd",
]
else:
# Should use OpenAlex abstract reconstruction
assert license_obj.short not in DERIVATIVE_ALLOWED_LICENSES
def test_license_processing_with_multiple_locations(openalex_data):
"""Test license processing logic with multiple publication locations."""
# Process all locations like the real application might encounter
locations = openalex_data.get("locations", [])
processed_licenses = []
for location in locations:
# Create data structure as it would appear from API
location_data = {"primary_location": location}
license_obj = LicenseProcessor.process_license(location_data)
processed_licenses.append(license_obj)
# Verify processing logic works for all location types
assert len(processed_licenses) > 0
assert all(isinstance(lic, License) for lic in processed_licenses)
# Should handle various license states consistently
for license_obj in processed_licenses:
if license_obj.short != "unknown":
assert isinstance(license_obj.short, str) and license_obj.short
def test_crossref_license_url_mapping_logic(crossref_data):
"""Test license URL to short-form mapping logic with real CrossRef data."""
# Extract license information as the real application would
crossref_licenses = crossref_data.get("message", {}).get("license", [])
if crossref_licenses:
license_url = crossref_licenses[0].get("URL", "")
# Test the mapping logic that would be used in practice
from doi2dataset import LICENSE_MAP
# Find corresponding short form by URL matching
matching_short = None
for short, (uri, _name) in LICENSE_MAP.items():
if uri == license_url:
matching_short = short
break
if matching_short:
# Test that our license processor handles this correctly
test_data = {"primary_location": {"license": matching_short}}
license_obj = LicenseProcessor.process_license(test_data)
assert license_obj.short == matching_short
assert license_obj.uri == license_url
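
The reverse lookup in the last test assumes LICENSE_MAP keys license short forms to (uri, name) pairs; extracted as a helper, the logic is (hypothetical helper, for illustration):

def short_form_for_url(license_url, license_map):
    # Return the short license id whose canonical URI matches, else None
    for short, (uri, _name) in license_map.items():
        if uri == license_url:
            return short
    return None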


@@ -1,5 +1,4 @@
import json
import os
import tempfile
from http import HTTPStatus
from pathlib import Path
@@ -9,14 +8,7 @@ import pytest
from doi2dataset import MetadataProcessor
@pytest.fixture
def openalex_data():
"""Load the saved JSON response from the file 'srep45389.json'"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
# openalex_data fixture now comes from conftest.py
@pytest.fixture
@@ -444,6 +436,175 @@ class TestMetadataProcessorErrorHandling:
with pytest.raises(KeyError, match="Missing required field"):
processor.process()
def test_update_progress_with_progress_bar(self):
"""Test progress update when progress bar is enabled."""
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=True
)
processor.console = MagicMock()
# Mock progress bar
mock_progress = MagicMock()
processor.progress = mock_progress
processor.task_id = "test_task_id"
processor._update_progress()
# Verify progress.advance was called
mock_progress.advance.assert_called_once_with("test_task_id")
def test_update_progress_without_progress_bar(self):
"""Test progress update when progress bar is disabled."""
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=False
)
processor.console = MagicMock()
# No progress bar set
processor.progress = None
processor.task_id = None
# Should not raise any errors
processor._update_progress()
@patch("doi2dataset.processing.metadata.APIClient")
def test_upload_success_with_persistent_id(self, mock_api_client_class):
"""Test successful upload with persistent ID response."""
import os
from doi2dataset import Config
# Load test config
config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
Config.load_config(config_path=config_path)
# Mock the APIClient instance and response
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = 201 # Success status for upload
mock_response.json.return_value = {
"data": {"persistentId": "doi:10.7910/DVN/TEST123"}
}
mock_client.make_request.return_value = mock_response
mock_api_client_class.return_value = mock_client
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
)
processor.console = MagicMock()
metadata = {"datasetVersion": {"files": []}}
result = processor._upload_data(metadata)
# Verify successful response handling
assert result["data"]["persistentId"] == "doi:10.7910/DVN/TEST123"
processor.console.print.assert_called()
@patch("doi2dataset.processing.metadata.APIClient")
def test_upload_success_console_output(self, mock_api_client_class):
"""Test console output during successful upload."""
import os
from unittest.mock import Mock
from doi2dataset import Config
# Load test config
config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
Config.load_config(config_path=config_path)
# Mock the APIClient instance and response
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = 201 # Success status for upload
mock_response.json.return_value = {
"data": {"persistentId": "doi:10.7910/DVN/TEST123"}
}
mock_client.make_request.return_value = mock_response
mock_api_client_class.return_value = mock_client
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
)
processor.console = MagicMock()
metadata = {"datasetVersion": {"files": []}}
processor._upload_data(metadata)
# Verify successful upload message was printed
processor.console.print.assert_called()
call_args = [call[0][0] for call in processor.console.print.call_args_list]
upload_message = next(
(msg for msg in call_args if "Dataset uploaded to:" in msg), None
)
assert upload_message is not None
assert "TEST123" in upload_message
def test_progress_update_integration(self):
"""Test progress updates during complete processing workflow."""
from unittest.mock import patch
# Mock all external dependencies
mock_data = {"title": "Test Paper", "authorships": []}
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._fetch_data",
return_value=mock_data,
):
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._build_metadata",
return_value={"test": "metadata"},
):
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._save_output"
):
processor = MetadataProcessor(
doi="10.1000/test",
output_path=Path("/tmp/test.json"),
progress=True,
)
processor.console = MagicMock()
# Mock progress bar
mock_progress = MagicMock()
processor.progress = mock_progress
processor.task_id = "test_task"
# Process should call _update_progress multiple times
processor.process()
# Verify progress was advanced multiple times (fetch, build, save)
assert mock_progress.advance.call_count >= 3
for call in mock_progress.advance.call_args_list:
assert call[0][0] == "test_task"
def test_fetch_data_with_real_structure(self, openalex_data):
"""Test _fetch_data method with realistic OpenAlex response structure."""
from http import HTTPStatus
from unittest.mock import Mock, patch
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = HTTPStatus.OK
mock_response.json.return_value = openalex_data
# Test fetch_data with real structure
mock_client.make_request.return_value = mock_response
with patch(
"doi2dataset.processing.metadata.APIClient", return_value=mock_client
):
processor = MetadataProcessor(
doi="10.1038/srep45389", output_path=Path("/tmp/test.json")
)
processor.console = MagicMock()
result = processor._fetch_data()
# Verify we got the expected data structure
assert result == openalex_data
assert "title" in result
assert "authorships" in result
assert "publication_date" in result
def test_partial_data(self):
"""Test handling of incomplete API responses."""
with patch(


@@ -84,3 +84,81 @@ def test_person_to_dict_with_no_affiliation():
assert result["family_name"] == "Green"
assert result["given_name"] == "Alice"
assert result["orcid"] == "0000-0002-1111-2222"
def test_person_creation_from_real_authorship_data(openalex_data):
"""Test Person creation by processing real OpenAlex authorship data."""
from doi2dataset.utils.validation import split_name
# Process first authorship like the real application would
first_authorship = openalex_data["authorships"][0]
author_data = first_authorship["author"]
# Extract display_name and process it like CitationBuilder does
display_name = author_data.get("display_name", "")
given_name, family_name = split_name(display_name)
# Extract ORCID and clean it like the real application
orcid = author_data.get("orcid")
if orcid and "orcid.org/" in orcid:
orcid = orcid.split("orcid.org/")[-1]
person = Person(
family_name=family_name,
given_name=given_name,
orcid=orcid,
email=None,
affiliation=None,
)
# Verify the processing worked correctly
assert person.family_name != ""
assert person.given_name != ""
if orcid:
assert len(person.orcid) == 19 # ORCID format: 0000-0000-0000-0000
def test_institution_processing_from_real_data(openalex_data):
"""Test Institution creation by processing real OpenAlex institution data."""
# Process first institution like the real application would
first_authorship = openalex_data["authorships"][0]
institution_data = first_authorship["institutions"][0]
# Extract and process data like CitationBuilder does
display_name = institution_data.get("display_name", "")
ror = institution_data.get("ror", "")
institution = Institution(display_name=display_name, ror=ror)
# Test that processing preserves essential functionality
assert len(institution.display_name) > 0
if ror:
assert ror.startswith("https://ror.org/")
affiliation_field = institution.affiliation_field()
assert affiliation_field.value == ror
assert affiliation_field.expanded_value["termName"] == display_name
def test_multiple_institutions_processing(openalex_data):
"""Test processing multiple institutions from real authorship data."""
institutions_created = []
# Process all institutions like the real application would
for authorship in openalex_data["authorships"]:
for institution_data in authorship.get("institutions", []):
display_name = institution_data.get("display_name", "")
ror = institution_data.get("ror", "")
if display_name: # Only create if there's actual data
institution = Institution(display_name=display_name, ror=ror)
institutions_created.append(institution)
# Verify we processed multiple institutions successfully
assert len(institutions_created) > 0
# All should have valid display names
assert all(len(inst.display_name) > 0 for inst in institutions_created)
# Some should have ROR IDs (based on real data)
ror_institutions = [inst for inst in institutions_created if inst.ror]
assert len(ror_institutions) > 0
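
The ORCID handling exercised in this file reduces to stripping the URL prefix (a sketch under the same assumption the tests make — that OpenAlex returns https://orcid.org/-prefixed IDs):

def normalize_orcid(orcid_url):
    # Strip the orcid.org URL prefix, leaving bare 0000-0000-0000-0000
    if orcid_url and "orcid.org/" in orcid_url:
        return orcid_url.split("orcid.org/")[-1]
    return orcid_url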


@@ -29,6 +29,86 @@ def test_get_publication_year_with_date(metadata_processor):
assert year == ""
def test_publication_year_processing_logic(openalex_data):
"""Test publication year extraction logic with real OpenAlex data structure."""
doi = openalex_data["doi"].replace("https://doi.org/", "")
processor = MetadataProcessor(doi=doi, upload=False, progress=False)
processor.console = MagicMock()
# Test the actual processing logic used by the application
year = processor._get_publication_year(openalex_data)
# Verify the processing logic works (should prefer publication_year field)
assert isinstance(year, int)
assert year > 1900 # Reasonable publication year
assert year <= 2030 # Sanity upper bound
def test_doi_validation_processing_pipeline(openalex_data):
"""Test DOI processing pipeline with real OpenAlex DOI format."""
from doi2dataset.utils.validation import normalize_doi, validate_doi
# Extract DOI as the real application would
doi_from_data = openalex_data.get("doi", "")
# Process DOI through the same pipeline as real application
if doi_from_data.startswith("https://doi.org/"):
clean_doi = doi_from_data.replace("https://doi.org/", "")
else:
clean_doi = doi_from_data
# Test validation and normalization logic
is_valid = validate_doi(clean_doi)
normalized = normalize_doi(clean_doi)
assert is_valid is True
assert normalized.startswith("10.")
assert len(normalized.split("/")) == 2 # Should have registrant/suffix format
def test_subject_mapping_processing_logic(openalex_data):
"""Test subject mapping logic with real OpenAlex topics structure."""
from doi2dataset import SubjectMapper
# Process topics exactly as the real application would
topics = openalex_data.get("topics", [])
# Test SubjectMapper processing logic
subjects = SubjectMapper.get_subjects({"topics": topics})
# Verify the mapping logic produces valid results
assert isinstance(subjects, list)
# If we have topics, we should get subjects
if topics:
assert len(subjects) > 0
# Each subject should be a string
assert all(isinstance(subj, str) for subj in subjects)
def test_abstract_reconstruction_processing(openalex_data):
"""Test abstract reconstruction logic with real inverted index data."""
from doi2dataset.api.client import APIClient
from doi2dataset.api.processors import AbstractProcessor
# Test the actual reconstruction logic used in the application
processor = AbstractProcessor(APIClient())
# Process abstract inverted index as the real application would
reconstructed = processor._get_openalex_abstract(openalex_data)
if openalex_data.get("abstract_inverted_index"):
# Should successfully reconstruct abstract
assert reconstructed is not None
assert isinstance(reconstructed, str)
assert len(reconstructed) > 0
# Should contain readable text with spaces
assert " " in reconstructed
else:
# Should handle missing abstract gracefully
assert reconstructed is None
def test_get_publication_year_with_both_fields(metadata_processor):
"""Test that _get_publication_year prioritizes publication_year over date"""
data = {"publication_year": 2020, "publication_date": "2019-05-15"}


@@ -188,6 +188,47 @@ def test_validate_email_validator_error():
assert result is False
@patch("dns.resolver.resolve")
def test_validate_email_dns_exceptions(mock_resolve):
"""Test email validation with DNS-related exceptions."""
# Test with mocked DNS resolver raising various exceptions
with patch("email_validator.validate_email") as mock_validate:
mock_result = Mock()
mock_result.normalized = "test@example.com"
mock_validate.return_value = mock_result
# Test with NoAnswer exception
mock_resolve.side_effect = dns.resolver.NoAnswer()
result = validate_email_address("test@example.com")
assert result is False
# Test with NXDOMAIN exception
mock_resolve.side_effect = dns.resolver.NXDOMAIN()
result = validate_email_address("test@example.com")
assert result is False
def test_validate_email_validator_exceptions():
"""Test email validation with email_validator exceptions."""
# Test email validator error
with patch("email_validator.validate_email") as mock_validate:
mock_validate.side_effect = EmailNotValidError("Invalid format")
result = validate_email_address("invalid-email")
assert result is False
# Test with various malformed emails that should fail validation
invalid_emails = [
"plainaddress",
"@missingusername.com",
"username@.com",
"username@com",
"username..double.dot@example.com",
]
for email in invalid_emails:
assert validate_email_address(email) is False
# DOI validation edge cases
def test_validate_doi_formats():
"""Test validation of various valid DOI formats."""