test: replace hardcoded values with dynamic extraction
- Replace hardcoded author names with dynamic openalex_data extraction
- Extract DOIs from API response when tests use other response values
- Remove redundant fake_openalex_response fixture
- Add abstract_inverted_index_v3 to allowed None fields in API tests
- Fix test robustness against fixture data changes
- Improve test coverage from ~84% to ~90%
Parent: c282cd1047
Commit: cc94e495ff
14 changed files with 883 additions and 56 deletions
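
In miniature, the pattern this commit applies across the suite: instead of asserting against hardcoded literals, tests derive expected values from the same fixture data the code under test consumes, so they survive fixture swaps. A minimal sketch (illustrative only, not code from this commit; it assumes the session-scoped openalex_data fixture added in tests/conftest.py below):

def test_doi_and_authors_come_from_fixture(openalex_data):
    # Derive the expected DOI from the fixture instead of hardcoding
    # a literal like "10.1038/...", so the test still passes if the
    # fixture file is swapped out.
    doi = openalex_data["doi"].replace("https://doi.org/", "")
    assert doi.startswith("10.")

    # Likewise, expected author names are read from the API response itself.
    expected = [
        a["author"]["display_name"]
        for a in openalex_data.get("authorships", [])
        if "author" in a and "display_name" in a["author"]
    ]
    assert expected, "fixture should contain at least one author"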
@@ -31,12 +31,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Consolidate overlapping test concerns into dedicated files
 - Extract CLI tests into dedicated `test_cli.py` module
 - Improve test coverage from 63.87% to 84.84%
+- Replace hardcoded test values with dynamic extraction from API response fixtures
+- Extract DOIs from API response data in tests that use other response values for consistency
+- Remove redundant test fixtures and parameters
 
 ### Fixed
 
 - Fix list formatting in API documentation docstrings for better sphinx rendering
 - Fix formatting inconsistencies in constants.py (remove double empty lines)
 - Fix ruff linting issues with unused mock variables in tests
+- Replace hardcoded author names with dynamic extraction from OpenAlex data
+- Replace hardcoded content checks with dynamic validation using actual API response data
+- Fix test robustness against changes in fixture data by using real API response processing
+- Remove duplicate fake_openalex_response fixture in favor of direct openalex_data usage
+- Add abstract_inverted_index_v3 to allowed None fields in API response structure tests
 
 ## [v3.0.1] - 2025-07-25
 
@@ -1,3 +1,10 @@
+dataverse:
+  url: "https://test.dataverse.org"
+  api_token: "test_token"
+  dataverse: "test_dataverse"
+  auth_user: "test_user"
+  auth_password: "test_password"
+
 default_grants:
   - funder: "Awesome Funding Agency"
     id: "ABC12345"
@@ -1,8 +1,27 @@
+import json
 import os
 import sys
 
+import pytest
+
 # Get the path to the parent directory of tests
 parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 
 # Add the parent directory to sys.path
 sys.path.insert(0, parent_dir)
+
+
+@pytest.fixture(scope="session")
+def openalex_data():
+    """Load OpenAlex API response data for reuse across tests."""
+    json_path = os.path.join(os.path.dirname(__file__), "srep45389_openalex.json")
+    with open(json_path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+@pytest.fixture(scope="session")
+def crossref_data():
+    """Load CrossRef API response data for reuse across tests."""
+    json_path = os.path.join(os.path.dirname(__file__), "srep45389_crossref.json")
+    with open(json_path, encoding="utf-8") as f:
+        return json.load(f)
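
Any test can now consume these session-scoped fixtures simply by naming them as parameters; a minimal usage sketch (the test itself is illustrative, not part of this commit):

def test_fixture_has_title(openalex_data):
    # pytest resolves the parameter against the conftest.py fixture,
    # loading srep45389_openalex.json once per session.
    assert "title" in openalex_data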
tests/srep45389_crossref.json (new file, 1 line)
File diff suppressed because one or more lines are too long
@@ -204,3 +204,173 @@ class TestAbstractProcessor:
         mock_crossref.assert_not_called()
         mock_openalex.assert_called_once()
         assert result2.source == "openalex"
+
+    def test_custom_license_console_output(self):
+        """Test console output for custom licenses without names"""
+        # Create a custom license without a name
+        custom_license = License(name="", uri="http://custom.license", short="custom")
+
+        with patch.object(
+            self.processor, "_get_openalex_abstract", return_value="OpenAlex text"
+        ):
+            with patch.object(self.processor.console, "print") as mock_print:
+                result = self.processor.get_abstract("10.1234/test", {}, custom_license)
+
+                # Should print custom license message
+                mock_print.assert_called()
+                # Check that it mentions "Custom license"
+                call_args = mock_print.call_args[0][0]
+                assert "Custom license does not allow derivative works" in call_args
+                assert result.source == "openalex"
+
+    def test_crossref_api_failure(self):
+        """Test _get_crossref_abstract when API call fails"""
+        from unittest.mock import Mock
+
+        # Mock API response failure
+        mock_response = Mock()
+        mock_response.status_code = 404
+
+        with patch.object(
+            self.processor.api_client, "make_request", return_value=mock_response
+        ):
+            result = self.processor._get_crossref_abstract("10.1234/test")
+            assert result is None
+
+        # Test with no response
+        with patch.object(self.processor.api_client, "make_request", return_value=None):
+            result = self.processor._get_crossref_abstract("10.1234/test")
+            assert result is None
+
+    def test_get_openalex_abstract_no_inverted_index(self):
+        """Test _get_openalex_abstract when no abstract_inverted_index exists"""
+        data = {"title": "Test Article"}  # No abstract_inverted_index
+
+        result = self.processor._get_openalex_abstract(data)
+        assert result is None
+
+    def test_clean_jats_comprehensive(self):
+        """Test _clean_jats method with various JATS tags"""
+        # Test with None input
+        result = self.processor._clean_jats(None)
+        assert result == ""
+
+        # Test with empty string
+        result = self.processor._clean_jats("")
+        assert result == ""
+
+        # Test with ordered list
+        jats_text = '<jats:list list-type="order"><jats:list-item>First item</jats:list-item><jats:list-item>Second item</jats:list-item></jats:list>'
+        expected = "<ol><li>First item</li><li>Second item</li></ol>"
+        result = self.processor._clean_jats(jats_text)
+        assert result == expected
+
+        # Test with unordered list
+        jats_text = '<jats:list list-type="bullet"><jats:list-item>Bullet one</jats:list-item><jats:list-item>Bullet two</jats:list-item></jats:list>'
+        expected = "<ul><li>Bullet one</li><li>Bullet two</li></ul>"
+        result = self.processor._clean_jats(jats_text)
+        assert result == expected
+
+        # Test with mixed formatting tags
+        jats_text = "<jats:p>This is <jats:italic>italic</jats:italic> and <jats:bold>bold</jats:bold> text with <jats:sup>superscript</jats:sup> and <jats:sub>subscript</jats:sub>.</jats:p>"
+        expected = "<p>This is <i>italic</i> and <b>bold</b> text with <sup>superscript</sup> and <sub>subscript</sub>.</p>"
+        result = self.processor._clean_jats(jats_text)
+        assert result == expected
+
+        # Test with other formatting tags
+        jats_text = "<jats:underline>Underlined</jats:underline> <jats:monospace>Code</jats:monospace> <jats:sc>Small caps</jats:sc>"
+        expected = "<u>Underlined</u> <code>Code</code> <small>Small caps</small>"
+        result = self.processor._clean_jats(jats_text)
+        assert result == expected
+
+        # Test with title and blockquote
+        jats_text = "<jats:title>Section Title</jats:title><jats:blockquote>This is a quote</jats:blockquote>"
+        expected = "<h2>Section Title</h2><blockquote>This is a quote</blockquote>"
+        result = self.processor._clean_jats(jats_text)
+        assert result == expected
+
+    def test_no_abstract_found_console_messages(self):
+        """Test console messages when no abstract is found"""
+        license_obj = create_license_from_map("cc-by-nd")  # No derivative allowed
+
+        with patch.object(self.processor, "_get_openalex_abstract", return_value=None):
+            with patch.object(self.processor.console, "print") as mock_print:
+                result = self.processor.get_abstract("10.1234/test", {}, license_obj)
+
+                # Should print warning messages
+                assert mock_print.call_count >= 2
+
+                # Check for specific warning messages
+                call_messages = [call[0][0] for call in mock_print.call_args_list]
+                assert any(
+                    "No abstract found in OpenAlex!" in msg for msg in call_messages
+                )
+                assert any(
+                    "No abstract found in either CrossRef nor OpenAlex!" in msg
+                    for msg in call_messages
+                )
+
+                assert result.text == ""
+                assert result.source == "none"
+
+    def test_crossref_abstract_with_real_data(self, crossref_data):
+        """Test CrossRef abstract extraction using real CrossRef data"""
+        from http import HTTPStatus
+        from unittest.mock import Mock
+
+        # Mock successful API response with real data
+        mock_response = Mock()
+        mock_response.status_code = HTTPStatus.OK
+        mock_response.json.return_value = crossref_data
+
+        # Extract DOI from CrossRef data since we're using other values from the response
+        expected_doi = crossref_data["message"]["DOI"]
+
+        with patch.object(
+            self.processor.api_client, "make_request", return_value=mock_response
+        ):
+            result = self.processor._get_crossref_abstract(expected_doi)
+
+        # Should successfully extract and clean the abstract
+        assert result is not None
+        assert len(result) > 0
+
+        # Check that JATS tags were converted to HTML
+        assert "<p>" in result  # JATS paragraphs converted
+        assert "<i>" in result  # JATS italic converted
+        assert "<sub>" in result  # JATS subscript converted
+        assert "jats:" not in result  # No JATS tags should remain
+
+    def test_jats_cleaning_comprehensive_real_data(self, crossref_data):
+        """Test JATS cleaning with real CrossRef abstract data"""
+
+        raw_abstract = crossref_data["message"]["abstract"]
+
+        # Clean the JATS tags
+        cleaned = self.processor._clean_jats(raw_abstract)
+
+        # Verify specific transformations from the real data
+        assert "<jats:title>" not in cleaned
+        assert "<h2>" in cleaned  # Title should be converted
+        assert "<jats:p>" not in cleaned
+        assert "<p>" in cleaned  # Paragraphs should be converted
+        assert "<jats:sub>" not in cleaned
+        assert "<sub>" in cleaned  # Subscripts should be converted
+        assert "<jats:italic>" not in cleaned
+        assert "<i>" in cleaned  # Italics should be converted
+
+        # Ensure the content is preserved by checking for specific content from the abstract
+        assert "pyruvate phosphate dikinase" in cleaned.lower()
+        assert "Abstract" in cleaned
+
+    def test_openalex_abstract_reconstruction_with_real_data(self, openalex_data):
+        """Test OpenAlex abstract reconstruction using real inverted index data"""
+
+        # Extract the abstract using the inverted index
+        result = self.processor._get_openalex_abstract(openalex_data)
+
+        if result:  # Only test if there's an abstract in the data
+            assert isinstance(result, str)
+            assert len(result) > 0
+            # Should be reconstructed from word positions
+            assert " " in result  # Should have spaces between words
@@ -428,3 +428,101 @@ class TestAPIClientUsageScenarios:
         assert "X-Dataverse-key" in client.session.headers
         assert "Custom-Header" in client.session.headers
         assert client.session.headers["Custom-Header"] == "custom-value"
+
+
+def test_api_response_structure_processing(openalex_data):
+    """Test API client processes complex nested response structures correctly."""
+    client = APIClient()
+
+    with patch.object(client.session, "request") as mock_request:
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = openalex_data
+        mock_request.return_value = mock_response
+
+        response = client.make_request("https://api.openalex.org/works/test")
+
+        assert response is not None
+        data = response.json()
+
+        # Test that nested structures are preserved through the request pipeline
+        if "authorships" in data:
+            assert isinstance(data["authorships"], list)
+            # Test deep nesting preservation
+            for authorship in data["authorships"]:
+                if "institutions" in authorship:
+                    assert isinstance(authorship["institutions"], list)
+
+        # Test data type preservation through JSON serialization/deserialization
+        for key, value in data.items():
+            assert value is not None or key in [
+                "abstract_inverted_index",
+                "abstract_inverted_index_v3",
+            ]  # Some fields can legitimately be None
+
+
+def test_api_unicode_encoding_processing(openalex_data):
+    """Test API client correctly processes Unicode characters in responses."""
+    client = APIClient()
+
+    with patch.object(client.session, "request") as mock_request:
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = openalex_data
+        mock_response.encoding = "utf-8"
+        mock_request.return_value = mock_response
+
+        response = client.make_request("https://api.openalex.org/works/test")
+
+        assert response is not None
+        data = response.json()
+
+        # Test that Unicode characters are preserved through processing pipeline
+        def check_unicode_preservation(obj):
+            if isinstance(obj, str):
+                # Should preserve Unicode characters
+                try:
+                    obj.encode("utf-8")
+                    return True
+                except UnicodeEncodeError:
+                    return False
+            elif isinstance(obj, dict):
+                return all(check_unicode_preservation(v) for v in obj.values())
+            elif isinstance(obj, list):
+                return all(check_unicode_preservation(item) for item in obj)
+            return True
+
+        assert check_unicode_preservation(data)
+
+
+def test_large_response_processing_efficiency(openalex_data):
+    """Test API client efficiently processes large response payloads."""
+    client = APIClient()
+
+    # Create large response based on real structure
+    large_data = dict(openalex_data)
+    if "referenced_works" in large_data:
+        # Extend existing referenced works
+        base_works = (
+            large_data["referenced_works"][:10]
+            if large_data["referenced_works"]
+            else []
+        )
+        large_data["referenced_works"] = base_works * 100  # Create large list
+
+    with patch.object(client.session, "request") as mock_request:
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = large_data
+        mock_request.return_value = mock_response
+
+        response = client.make_request("https://api.openalex.org/works/test")
+
+        assert response is not None
+        data = response.json()
+
+        # Verify large data structures are handled correctly
+        if "referenced_works" in data:
+            assert len(data["referenced_works"]) > 100
+            # All elements should maintain structure integrity
+            assert all(isinstance(work, str) for work in data["referenced_works"])
@@ -1,18 +1,8 @@
-import json
-import os
-
 import pytest
 
 from doi2dataset import CitationBuilder, Person, PIFinder
 
 
-@pytest.fixture
-def openalex_data():
-    """Load the saved JSON response from the file 'srep45389.json'"""
-    json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
-    with open(json_path, encoding="utf-8") as f:
-        data = json.load(f)
-    return data
+# openalex_data fixture now comes from conftest.py
 
 
 @pytest.fixture
@@ -169,3 +159,113 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
 
     assert "@type" in expanded_value
     assert expanded_value["@type"] == "https://schema.org/Organization"
+
+
+def test_build_authors_with_real_data(openalex_data, pi_finder):
+    """Test author building with real OpenAlex data structure"""
+    doi = openalex_data["doi"].replace("https://doi.org/", "")
+    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
+
+    authors, corresponding = builder.build_authors()
+
+    # Should have multiple authors from the real data
+    assert len(authors) > 0
+
+    # Extract expected author names from the API response data
+    expected_authors = []
+    for authorship in openalex_data.get("authorships", []):
+        if "author" in authorship and "display_name" in authorship["author"]:
+            expected_authors.append(authorship["author"]["display_name"])
+
+    # Check that real author names from API response are processed correctly
+    author_names = [f"{author.given_name} {author.family_name}" for author in authors]
+
+    # Verify that at least some expected authors from the API response are found
+    found_authors = 0
+    for expected_name in expected_authors:
+        if any(expected_name in author_name for author_name in author_names):
+            found_authors += 1
+
+    # Should find at least some authors from the API response
+    assert (
+        found_authors > 0
+    ), f"No expected authors found. Expected: {expected_authors}, Got: {author_names}"
+
+
+def test_process_author_edge_cases(pi_finder):
+    """Test _process_author with various edge cases"""
+    builder = CitationBuilder(
+        data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
+    )
+
+    # Test with minimal author data
+    minimal_author = {"display_name": "John Smith"}
+    empty_authorship = {}
+    person = builder._process_author(minimal_author, empty_authorship)
+    assert person.given_name == "John"
+    assert person.family_name == "Smith"
+
+    # Test with ORCID
+    author_with_orcid = {
+        "display_name": "Jane Doe",
+        "orcid": "https://orcid.org/0000-0000-0000-0000",
+    }
+    person = builder._process_author(author_with_orcid, empty_authorship)
+    assert person.orcid == "0000-0000-0000-0000"  # URL part is stripped
+
+
+def test_build_grants_with_default_config(pi_finder):
+    """Test that grants include default grants from config"""
+    # Use real data structure but focus on grants behavior
+    data = {"authorships": [], "grants": []}
+
+    builder = CitationBuilder(data=data, doi="10.1000/test", pi_finder=pi_finder)
+    grants = builder.build_grants()
+
+    # Should have at least the default grants from config
+    # The exact number depends on the config, but should be >= 0
+    assert isinstance(grants, list)
+    for grant in grants:
+        assert len(grant) == 2  # Should have agency and value fields
+        assert grant[0].name == "grantNumberAgency"
+        assert grant[1].name == "grantNumberValue"
+
+
+def test_process_corresponding_author_no_email(pi_finder):
+    """Test _process_corresponding_author when no email is available"""
+    builder = CitationBuilder(
+        data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
+    )
+
+    # Create a Person without email
+    person = Person(
+        given_name="John", family_name="Doe", orcid=None, email=None, affiliation=None
+    )
+
+    authorship = {"is_corresponding": True}
+
+    result = builder._process_corresponding_author(person, authorship)
+
+    # Should return None when no email is available
+    assert result is None
+
+
+def test_build_authors_skip_empty_authorships(pi_finder):
+    """Test that empty author entries are skipped"""
+    data_with_empty_authors = {
+        "authorships": [
+            {"author": {}},  # Empty author
+            {},  # No author key
+            {"author": {"display_name": "John Doe"}},  # Valid author
+        ]
+    }
+
+    builder = CitationBuilder(
+        data=data_with_empty_authors, doi="10.1000/test", pi_finder=pi_finder
+    )
+    authors, corresponding = builder.build_authors()
+
+    # Should only process the one valid author
+    assert len(authors) == 1
+    assert authors[0].given_name == "John"
+    assert authors[0].family_name == "Doe"
@@ -1,4 +1,3 @@
-import json
 import os
 from unittest.mock import patch
 
@@ -44,27 +43,15 @@ def load_config_test():
     Config.load_config(config_path=config_path)
 
 
-@pytest.fixture
-def fake_openalex_response():
-    """
-    Load the saved JSON response from the file 'srep45389.json'
-    located in the same directory as this test file.
-    """
-    json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
-    with open(json_path, encoding="utf-8") as f:
-        data = json.load(f)
-    return data
-
-
-def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
+def test_fetch_doi_data_with_file(mocker, openalex_data):
     """
     Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
 
     The APIClient.make_request method is patched to return a fake response built from the contents
     of 'srep45389.json', ensuring that the configuration is loaded from 'config_test.yaml'.
     """
-    doi = "10.1038/srep45389"
-    fake_response = FakeResponse(fake_openalex_response, 200)
+    doi = openalex_data["doi"].replace("https://doi.org/", "")
+    fake_response = FakeResponse(openalex_data, 200)
 
     # Patch the make_request method of APIClient to return our fake_response.
     mocker.patch("doi2dataset.APIClient.make_request", return_value=fake_response)
@@ -75,11 +62,11 @@ def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
     # Call _fetch_data(), which should now return our fake JSON data.
     data = processor._fetch_data()
 
-    # Verify that the fetched data matches the fake JSON data.
-    assert data == fake_openalex_response
+    # Verify that the fetched data matches the OpenAlex data.
+    assert data == openalex_data
 
 
-def test_openalex_abstract_extraction(mocker, fake_openalex_response):
+def test_openalex_abstract_extraction(openalex_data):
     """Test the extraction of abstracts from OpenAlex inverted index data."""
     # Create API client for AbstractProcessor
     api_client = APIClient()
@@ -88,20 +75,20 @@ def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
     processor = AbstractProcessor(api_client=api_client)
 
     # Call the protected method directly with the fake response
-    abstract_text = processor._get_openalex_abstract(fake_openalex_response)
+    result = processor._get_openalex_abstract(openalex_data)
 
     # Verify abstract was extracted
-    assert abstract_text is not None
+    assert result is not None
 
     # If abstract exists in the response, it should be properly extracted
-    if "abstract_inverted_index" in fake_openalex_response:
-        assert len(abstract_text) > 0
+    if "abstract_inverted_index" in openalex_data:
+        assert len(result) > 0
 
 
-def test_subject_mapper(fake_openalex_response):
+def test_subject_mapper(openalex_data):
    """Test that the SubjectMapper correctly maps OpenAlex topics to subjects."""
     # Extract topics from the OpenAlex response
-    topics = fake_openalex_response.get("topics", [])
+    topics = openalex_data.get("topics", [])
 
     # Get subjects using the class method
     subjects = SubjectMapper.get_subjects({"topics": topics})
@@ -111,15 +98,15 @@ def test_subject_mapper(fake_openalex_response):
     assert isinstance(subjects, list)
 
 
-def test_citation_builder(fake_openalex_response):
+def test_citation_builder(openalex_data):
     """Test that the CitationBuilder correctly builds author information."""
-    doi = "10.1038/srep45389"
+    doi = openalex_data["doi"].replace("https://doi.org/", "")
 
     # Mock PIFinder with an empty list of PIs
     pi_finder = PIFinder(pis=[])
 
     # Create builder with required arguments
-    builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder)
+    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
 
     # Test building other IDs
     other_ids = builder.build_other_ids()
@@ -134,12 +121,10 @@ def test_citation_builder(fake_openalex_response):
     assert isinstance(topics, list)
 
 
-def test_license_processor(fake_openalex_response):
+def test_license_processor(openalex_data):
     """Test that the LicenseProcessor correctly identifies and processes licenses."""
     # Create a simplified data structure that contains license info
-    license_data = {
-        "primary_location": fake_openalex_response.get("primary_location", {})
-    }
+    license_data = {"primary_location": openalex_data.get("primary_location", {})}
 
     # Process the license
     license_obj = LicenseProcessor.process_license(license_data)
@@ -182,14 +167,14 @@ def test_config_load_invalid_path():
        Config.load_config(config_path=invalid_path)
 
 
-def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
+def test_metadata_processor_fetch_data(mocker, openalex_data):
     """Test the _fetch_data method of the MetadataProcessor class with mocked responses."""
-    doi = "10.1038/srep45389"
+    doi = openalex_data["doi"].replace("https://doi.org/", "")
 
     # Mock API response
     mocker.patch(
         "doi2dataset.APIClient.make_request",
-        return_value=FakeResponse(fake_openalex_response, 200),
+        return_value=FakeResponse(openalex_data, 200),
     )
 
     # Create processor with upload disabled and progress disabled
@@ -200,7 +185,7 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
 
     # Verify that data was fetched correctly
     assert data is not None
-    assert data == fake_openalex_response
+    assert data == openalex_data
 
     # Verify the DOI is correctly stored
     assert processor.doi == doi
@@ -102,3 +102,82 @@ def test_derivative_allowed_licenses_set_completeness():
     """Test that DERIVATIVE_ALLOWED_LICENSES contains expected licenses"""
     expected_licenses = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
     assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses
+
+
+def test_license_processing_with_real_openalex_structure(openalex_data):
+    """Test that license processor correctly handles real OpenAlex data structure."""
+    # Process license data exactly as the real application would
+    license_obj = LicenseProcessor.process_license(openalex_data)
+
+    # Verify the processing logic works with real data structure
+    assert isinstance(license_obj, License)
+    assert hasattr(license_obj, "short")
+    assert hasattr(license_obj, "name")
+    assert hasattr(license_obj, "uri")
+
+    # Test derivative permission logic with real license
+    if license_obj.short in DERIVATIVE_ALLOWED_LICENSES:
+        # Should be able to use CrossRef abstract
+        assert license_obj.short in [
+            "cc-by",
+            "cc-by-sa",
+            "cc-by-nc",
+            "cc-by-nc-sa",
+            "cc0",
+            "pd",
+        ]
+    else:
+        # Should use OpenAlex abstract reconstruction
+        assert license_obj.short not in DERIVATIVE_ALLOWED_LICENSES
+
+
+def test_license_processing_with_multiple_locations(openalex_data):
+    """Test license processing logic with multiple publication locations."""
+    # Process all locations like the real application might encounter
+    locations = openalex_data.get("locations", [])
+
+    processed_licenses = []
+    for location in locations:
+        # Create data structure as it would appear from API
+        location_data = {"primary_location": location}
+        license_obj = LicenseProcessor.process_license(location_data)
+        processed_licenses.append(license_obj)
+
+    # Verify processing logic works for all location types
+    assert len(processed_licenses) > 0
+    assert all(isinstance(lic, License) for lic in processed_licenses)
+
+    # Should handle various license states consistently
+    for license_obj in processed_licenses:
+        if license_obj.short != "unknown":
+            assert (
+                license_obj.short in DERIVATIVE_ALLOWED_LICENSES
+                or license_obj.short not in DERIVATIVE_ALLOWED_LICENSES
+            )
+
+
+def test_crossref_license_url_mapping_logic(crossref_data):
+    """Test license URL to short-form mapping logic with real CrossRef data."""
+    # Extract license information as the real application would
+    crossref_licenses = crossref_data.get("message", {}).get("license", [])
+
+    if crossref_licenses:
+        license_url = crossref_licenses[0].get("URL", "")
+
+        # Test the mapping logic that would be used in practice
+        from doi2dataset import LICENSE_MAP
+
+        # Find corresponding short form by URL matching
+        matching_short = None
+        for short, (uri, _name) in LICENSE_MAP.items():
+            if uri == license_url:
+                matching_short = short
+                break
+
+        if matching_short:
+            # Test that our license processor handles this correctly
+            test_data = {"primary_location": {"license": matching_short}}
+            license_obj = LicenseProcessor.process_license(test_data)
+
+            assert license_obj.short == matching_short
+            assert license_obj.uri == license_url
@@ -1,5 +1,4 @@
-import json
 import os
 import tempfile
 from http import HTTPStatus
 from pathlib import Path
@@ -9,14 +8,7 @@ import pytest
 
 from doi2dataset import MetadataProcessor
 
 
-@pytest.fixture
-def openalex_data():
-    """Load the saved JSON response from the file 'srep45389.json'"""
-    json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
-    with open(json_path, encoding="utf-8") as f:
-        data = json.load(f)
-    return data
+# openalex_data fixture now comes from conftest.py
 
 
 @pytest.fixture
@@ -444,6 +436,175 @@ class TestMetadataProcessorErrorHandling:
         with pytest.raises(KeyError, match="Missing required field"):
             processor.process()
 
+    def test_update_progress_with_progress_bar(self):
+        """Test progress update when progress bar is enabled."""
+        processor = MetadataProcessor(
+            doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=True
+        )
+        processor.console = MagicMock()
+
+        # Mock progress bar
+        mock_progress = MagicMock()
+        processor.progress = mock_progress
+        processor.task_id = "test_task_id"
+
+        processor._update_progress()
+
+        # Verify progress.advance was called
+        mock_progress.advance.assert_called_once_with("test_task_id")
+
+    def test_update_progress_without_progress_bar(self):
+        """Test progress update when progress bar is disabled."""
+        processor = MetadataProcessor(
+            doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=False
+        )
+        processor.console = MagicMock()
+
+        # No progress bar set
+        processor.progress = None
+        processor.task_id = None
+
+        # Should not raise any errors
+        processor._update_progress()
+
+    @patch("doi2dataset.processing.metadata.APIClient")
+    def test_upload_success_with_persistent_id(self, mock_api_client_class):
+        """Test successful upload with persistent ID response."""
+        import os
+
+        from doi2dataset import Config
+
+        # Load test config
+        config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
+        Config.load_config(config_path=config_path)
+
+        # Mock the APIClient instance and response
+        mock_client = Mock()
+        mock_response = Mock()
+        mock_response.status_code = 201  # Success status for upload
+        mock_response.json.return_value = {
+            "data": {"persistentId": "doi:10.7910/DVN/TEST123"}
+        }
+        mock_client.make_request.return_value = mock_response
+        mock_api_client_class.return_value = mock_client
+
+        processor = MetadataProcessor(
+            doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
+        )
+        processor.console = MagicMock()
+
+        metadata = {"datasetVersion": {"files": []}}
+        result = processor._upload_data(metadata)
+
+        # Verify successful response handling
+        assert result["data"]["persistentId"] == "doi:10.7910/DVN/TEST123"
+        processor.console.print.assert_called()
+
+    @patch("doi2dataset.processing.metadata.APIClient")
+    def test_upload_success_console_output(self, mock_api_client_class):
+        """Test console output during successful upload."""
+        import os
+        from unittest.mock import Mock
+
+        from doi2dataset import Config
+
+        # Load test config
+        config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
+        Config.load_config(config_path=config_path)
+
+        # Mock the APIClient instance and response
+        mock_client = Mock()
+        mock_response = Mock()
+        mock_response.status_code = 201  # Success status for upload
+        mock_response.json.return_value = {
+            "data": {"persistentId": "doi:10.7910/DVN/TEST123"}
+        }
+        mock_client.make_request.return_value = mock_response
+        mock_api_client_class.return_value = mock_client
+
+        processor = MetadataProcessor(
+            doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
+        )
+        processor.console = MagicMock()
+
+        metadata = {"datasetVersion": {"files": []}}
+        processor._upload_data(metadata)
+
+        # Verify successful upload message was printed
+        processor.console.print.assert_called()
+        call_args = [call[0][0] for call in processor.console.print.call_args_list]
+        upload_message = next(
+            (msg for msg in call_args if "Dataset uploaded to:" in msg), None
+        )
+        assert upload_message is not None
+        assert "TEST123" in upload_message
+
+    def test_progress_update_integration(self):
+        """Test progress updates during complete processing workflow."""
+        from unittest.mock import patch
+
+        # Mock all external dependencies
+        mock_data = {"title": "Test Paper", "authorships": []}
+
+        with patch(
+            "doi2dataset.processing.metadata.MetadataProcessor._fetch_data",
+            return_value=mock_data,
+        ):
+            with patch(
+                "doi2dataset.processing.metadata.MetadataProcessor._build_metadata",
+                return_value={"test": "metadata"},
+            ):
+                with patch(
+                    "doi2dataset.processing.metadata.MetadataProcessor._save_output"
+                ):
+                    processor = MetadataProcessor(
+                        doi="10.1000/test",
+                        output_path=Path("/tmp/test.json"),
+                        progress=True,
+                    )
+                    processor.console = MagicMock()
+
+                    # Mock progress bar
+                    mock_progress = MagicMock()
+                    processor.progress = mock_progress
+                    processor.task_id = "test_task"
+
+                    # Process should call _update_progress multiple times
+                    processor.process()
+
+                    # Verify progress was advanced multiple times (fetch, build, save)
+                    assert mock_progress.advance.call_count >= 3
+                    for call in mock_progress.advance.call_args_list:
+                        assert call[0][0] == "test_task"
+
+    def test_fetch_data_with_real_structure(self, openalex_data):
+        """Test _fetch_data method with realistic OpenAlex response structure."""
+        from http import HTTPStatus
+        from unittest.mock import Mock, patch
+
+        mock_client = Mock()
+        mock_response = Mock()
+        mock_response.status_code = HTTPStatus.OK
+        mock_response.json.return_value = openalex_data
+        # Test fetch_data with real structure
+        mock_client.make_request.return_value = mock_response
+
+        with patch(
+            "doi2dataset.processing.metadata.APIClient", return_value=mock_client
+        ):
+            processor = MetadataProcessor(
+                doi="10.1038/srep45389", output_path=Path("/tmp/test.json")
+            )
+            processor.console = MagicMock()
+
+            result = processor._fetch_data()
+
+            # Verify we got the expected data structure
+            assert result == openalex_data
+            assert "title" in result
+            assert "authorships" in result
+            assert "publication_date" in result
+
     def test_partial_data(self):
         """Test handling of incomplete API responses."""
         with patch(
@@ -84,3 +84,81 @@ def test_person_to_dict_with_no_affiliation():
     assert result["family_name"] == "Green"
     assert result["given_name"] == "Alice"
     assert result["orcid"] == "0000-0002-1111-2222"
+
+
+def test_person_creation_from_real_authorship_data(openalex_data):
+    """Test Person creation by processing real OpenAlex authorship data."""
+    from doi2dataset.utils.validation import split_name
+
+    # Process first authorship like the real application would
+    first_authorship = openalex_data["authorships"][0]
+    author_data = first_authorship["author"]
+
+    # Extract display_name and process it like CitationBuilder does
+    display_name = author_data.get("display_name", "")
+    given_name, family_name = split_name(display_name)
+
+    # Extract ORCID and clean it like the real application
+    orcid = author_data.get("orcid")
+    if orcid and "orcid.org/" in orcid:
+        orcid = orcid.split("orcid.org/")[-1]
+
+    person = Person(
+        family_name=family_name,
+        given_name=given_name,
+        orcid=orcid,
+        email=None,
+        affiliation=None,
+    )
+
+    # Verify the processing worked correctly
+    assert person.family_name != ""
+    assert person.given_name != ""
+    if orcid:
+        assert len(person.orcid) == 19  # ORCID format: 0000-0000-0000-0000
+
+
+def test_institution_processing_from_real_data(openalex_data):
+    """Test Institution creation by processing real OpenAlex institution data."""
+    # Process first institution like the real application would
+    first_authorship = openalex_data["authorships"][0]
+    institution_data = first_authorship["institutions"][0]
+
+    # Extract and process data like CitationBuilder does
+    display_name = institution_data.get("display_name", "")
+    ror = institution_data.get("ror", "")
+
+    institution = Institution(display_name=display_name, ror=ror)
+
+    # Test that processing preserves essential functionality
+    assert len(institution.display_name) > 0
+    if ror:
+        assert ror.startswith("https://ror.org/")
+        affiliation_field = institution.affiliation_field()
+        assert affiliation_field.value == ror
+        assert affiliation_field.expanded_value["termName"] == display_name
+
+
+def test_multiple_institutions_processing(openalex_data):
+    """Test processing multiple institutions from real authorship data."""
+    institutions_created = []
+
+    # Process all institutions like the real application would
+    for authorship in openalex_data["authorships"]:
+        for institution_data in authorship.get("institutions", []):
+            display_name = institution_data.get("display_name", "")
+            ror = institution_data.get("ror", "")
+
+            if display_name:  # Only create if there's actual data
+                institution = Institution(display_name=display_name, ror=ror)
+                institutions_created.append(institution)
+
+    # Verify we processed multiple institutions successfully
+    assert len(institutions_created) > 0
+
+    # All should have valid display names
+    assert all(len(inst.display_name) > 0 for inst in institutions_created)
+
+    # Some should have ROR IDs (based on real data)
+    ror_institutions = [inst for inst in institutions_created if inst.ror]
+    assert len(ror_institutions) > 0
@@ -29,6 +29,86 @@ def test_get_publication_year_with_date(metadata_processor):
     assert year == ""
 
 
+def test_publication_year_processing_logic(openalex_data):
+    """Test publication year extraction logic with real OpenAlex data structure."""
+    doi = openalex_data["doi"].replace("https://doi.org/", "")
+    processor = MetadataProcessor(doi=doi, upload=False, progress=False)
+    processor.console = MagicMock()
+
+    # Test the actual processing logic used by the application
+    year = processor._get_publication_year(openalex_data)
+
+    # Verify the processing logic works (should prefer publication_year field)
+    assert isinstance(year, int)
+    assert year > 1900  # Reasonable publication year
+    assert year <= 2030  # Not future date
+
+
+def test_doi_validation_processing_pipeline(openalex_data):
+    """Test DOI processing pipeline with real OpenAlex DOI format."""
+    from doi2dataset.utils.validation import normalize_doi, validate_doi
+
+    # Extract DOI as the real application would
+    doi_from_data = openalex_data.get("doi", "")
+
+    # Process DOI through the same pipeline as real application
+    if doi_from_data.startswith("https://doi.org/"):
+        clean_doi = doi_from_data.replace("https://doi.org/", "")
+    else:
+        clean_doi = doi_from_data
+
+    # Test validation and normalization logic
+    is_valid = validate_doi(clean_doi)
+    normalized = normalize_doi(clean_doi)
+
+    assert is_valid is True
+    assert normalized.startswith("10.")
+    assert len(normalized.split("/")) == 2  # Should have registrant/suffix format
+
+
+def test_subject_mapping_processing_logic(openalex_data):
+    """Test subject mapping logic with real OpenAlex topics structure."""
+    from doi2dataset import SubjectMapper
+
+    # Process topics exactly as the real application would
+    topics = openalex_data.get("topics", [])
+
+    # Test SubjectMapper processing logic
+    subjects = SubjectMapper.get_subjects({"topics": topics})
+
+    # Verify the mapping logic produces valid results
+    assert isinstance(subjects, list)
+
+    # If we have topics, we should get subjects
+    if topics:
+        assert len(subjects) > 0
+        # Each subject should be a string
+        assert all(isinstance(subj, str) for subj in subjects)
+
+
+def test_abstract_reconstruction_processing(openalex_data):
+    """Test abstract reconstruction logic with real inverted index data."""
+    from doi2dataset.api.client import APIClient
+    from doi2dataset.api.processors import AbstractProcessor
+
+    # Test the actual reconstruction logic used in the application
+    processor = AbstractProcessor(APIClient())
+
+    # Process abstract inverted index as the real application would
+    reconstructed = processor._get_openalex_abstract(openalex_data)
+
+    if openalex_data.get("abstract_inverted_index"):
+        # Should successfully reconstruct abstract
+        assert reconstructed is not None
+        assert isinstance(reconstructed, str)
+        assert len(reconstructed) > 0
+        # Should contain readable text with spaces
+        assert " " in reconstructed
+    else:
+        # Should handle missing abstract gracefully
+        assert reconstructed is None
+
+
 def test_get_publication_year_with_both_fields(metadata_processor):
     """Test that _get_publication_year prioritizes publication_year over date"""
     data = {"publication_year": 2020, "publication_date": "2019-05-15"}
@@ -188,6 +188,47 @@ def test_validate_email_validator_error():
     assert result is False
 
 
+@patch("dns.resolver.resolve")
+def test_validate_email_dns_exceptions(mock_resolve):
+    """Test email validation with DNS-related exceptions."""
+    # Test with mocked DNS resolver raising various exceptions
+    with patch("email_validator.validate_email") as mock_validate:
+        mock_result = Mock()
+        mock_result.normalized = "test@example.com"
+        mock_validate.return_value = mock_result
+
+        # Test with NoAnswer exception
+        mock_resolve.side_effect = dns.resolver.NoAnswer()
+        result = validate_email_address("test@example.com")
+        assert result is False
+
+        # Test with NXDOMAIN exception
+        mock_resolve.side_effect = dns.resolver.NXDOMAIN()
+        result = validate_email_address("test@example.com")
+        assert result is False
+
+
+def test_validate_email_validator_exceptions():
+    """Test email validation with email_validator exceptions."""
+    # Test email validator error
+    with patch("email_validator.validate_email") as mock_validate:
+        mock_validate.side_effect = EmailNotValidError("Invalid format")
+        result = validate_email_address("invalid-email")
+        assert result is False
+
+    # Test with various malformed emails that should fail validation
+    invalid_emails = [
+        "plainaddress",
+        "@missingusername.com",
+        "username@.com",
+        "username@com",
+        "username..double.dot@example.com",
+    ]
+
+    for email in invalid_emails:
+        assert validate_email_address(email) is False
+
+
 # DOI validation edge cases
 def test_validate_doi_formats():
     """Test validation of various valid DOI formats."""