feat!: generalize script by removing organizational metadata
All checks were successful
Test pipeline / test (push) Successful in 14s

Remove the Phase class, the organizational metadata blocks, and the unused project fields. Update the
configuration to use 'default_grants', and simplify PI usage so that PIs serve only as a fallback for
determining the corresponding author.

BREAKING CHANGES:
- Remove the 'phase' and 'project' fields from the configuration
- Use 'default_grants' instead of 'default_grant'
- Generate only standard Dataverse citation metadata
This commit is contained in:
Alexander Minges 2025-07-07 14:41:39 +02:00
parent 01bc537bd8
commit 67b46d5140
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
11 changed files with 207 additions and 269 deletions

View file

@ -1,23 +1,16 @@
default_grant:
default_grants:
- funder: "Awesome Funding Agency"
id: "ABC12345"
phase:
"Phase 1 (2021/2025)":
start: 2021
end: 2025
pis:
- family_name: "Doe"
given_name: "Jon"
orcid: "0000-0000-0000-0000"
email: "jon.doe@iana.org"
affiliation: "Institute of Science, Some University"
project: ["Project A01"]
- family_name: "Doe"
given_name: "Jane"
orcid: "0000-0000-0000-0001"
email: "jane.doe@iana.org"
affiliation: "Institute of Science, Some University"
project: ["Project A02"]

View file

@ -1,13 +1,9 @@
import json
import os
import pytest
from unittest.mock import MagicMock
from doi2dataset import (
CitationBuilder,
PIFinder,
Person
)
import pytest
from doi2dataset import CitationBuilder, Person, PIFinder
@pytest.fixture
@ -27,8 +23,7 @@ def test_pi():
given_name="Author",
orcid="0000-0000-0000-1234",
email="test.author@example.org",
affiliation="Test University",
project=["Test Project"]
affiliation="Test University"
)
@ -43,15 +38,15 @@ def test_build_authors(openalex_data, pi_finder):
"""Test that CitationBuilder.build_authors correctly processes author information"""
doi = "10.1038/srep45389"
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
# Call the build_authors method - returns tuple of (authors, corresponding_authors)
authors, corresponding_authors = builder.build_authors()
# Verify that authors were created
assert authors is not None
assert isinstance(authors, list)
assert len(authors) > 0
# Check the structure of the authors
for author in authors:
assert hasattr(author, "given_name")
@ -64,17 +59,17 @@ def test_build_authors_with_affiliations(openalex_data, pi_finder):
"""Test that author affiliations are correctly processed"""
doi = "10.1038/srep45389"
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
# Call the build_authors method
authors, _ = builder.build_authors()
# Check if any authors have affiliation
affiliation_found = False
for author in authors:
if hasattr(author, "affiliation") and author.affiliation:
affiliation_found = True
break
# We may not have affiliations in the test data, so only assert if we found any
if affiliation_found:
assert affiliation_found, "No author with affiliation found"
@ -84,14 +79,14 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder):
"""Test that corresponding authors are correctly identified"""
doi = "10.1038/srep45389"
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
# Process authors
authors, corresponding_authors = builder.build_authors()
# Verify that corresponding authors were identified
if len(corresponding_authors) > 0:
assert len(corresponding_authors) > 0, "No corresponding authors identified"
# Check structure of corresponding authors
for author in corresponding_authors:
assert hasattr(author, "given_name")
@ -103,7 +98,7 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder):
def test_build_authors_with_ror(openalex_data, pi_finder):
"""Test that ROR (Research Organization Registry) identifiers are correctly used when ror=True"""
doi = "10.1038/srep45389"
# First confirm the sample data contains at least one institution with a ROR identifier
has_ror_institution = False
for authorship in openalex_data.get("authorships", []):
@ -114,61 +109,61 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
break
if has_ror_institution:
break
# Skip test if no ROR identifiers in sample data
if not has_ror_institution:
pytest.skip("Test data doesn't contain any ROR identifiers")
# Create builder with ror=True to enable ROR identifiers
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True)
# Get authors
authors, _ = builder.build_authors()
# Verify we got authors back
assert len(authors) > 0, "No authors were extracted from the test data"
# Check for at least one Institution with a ROR ID
ror_found = False
institution_with_ror = None
for author in authors:
# Check if author has affiliation
if not hasattr(author, 'affiliation') or not author.affiliation:
continue
# Check if affiliation is an Institution with a ROR ID
if not hasattr(author.affiliation, 'ror'):
continue
# Check if ROR ID is present and contains "ror.org"
if author.affiliation.ror and "ror.org" in author.affiliation.ror:
ror_found = True
institution_with_ror = author.affiliation
break
# Verify ROR IDs are used when ror=True
assert ror_found, "Expected at least one author with a ROR ID when ror=True"
# Check expanded_value in the affiliation field when ROR is used
if institution_with_ror:
# Get the affiliation field
affiliation_field = institution_with_ror.affiliation_field()
# Verify it's set up correctly with the ROR ID as the value
assert affiliation_field.value == institution_with_ror.ror
# Verify the expanded_value dictionary has the expected structure
assert hasattr(affiliation_field, 'expanded_value')
assert isinstance(affiliation_field.expanded_value, dict)
# Check specific fields in the expanded_value
expanded_value = affiliation_field.expanded_value
assert "scheme" in expanded_value
assert expanded_value["scheme"] == "http://www.grid.ac/ontology/"
assert "termName" in expanded_value
assert expanded_value["termName"] == institution_with_ror.display_name
assert "@type" in expanded_value
assert expanded_value["@type"] == "https://schema.org/Organization"
assert expanded_value["@type"] == "https://schema.org/Organization"

View file

@ -3,21 +3,9 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from doi2dataset import NameProcessor, Phase, sanitize_filename, validate_email_address
from doi2dataset import NameProcessor, sanitize_filename, validate_email_address
def test_phase_check_year():
    """Verify Phase.check_year for years inside, outside, and on the edges of a phase."""
    test_phase = Phase("TestPhase", 2000, 2010)

    # Each year is paired with the exact boolean check_year is expected to return.
    expectations = (
        (2005, True),   # within boundaries
        (1999, False),  # outside boundaries
        (2011, False),
        (2000, True),   # boundary cases
        (2010, True),
    )
    for year, expected in expectations:
        assert test_phase.check_year(year) is expected
def test_sanitize_filename():
"""Test the sanitize_filename function to convert DOI to a valid filename."""
doi = "10.1234/abc.def"

View file

@ -4,16 +4,15 @@ import os
import pytest
from doi2dataset import (
AbstractProcessor,
AbstractProcessor,
APIClient,
CitationBuilder,
Config,
License,
LicenseProcessor,
CitationBuilder,
Config,
LicenseProcessor,
MetadataProcessor,
Person,
PIFinder,
SubjectMapper
SubjectMapper,
)
@ -78,16 +77,16 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
"""Test the extraction of abstracts from OpenAlex inverted index data."""
# Create API client for AbstractProcessor
api_client = APIClient()
# Create processor
processor = AbstractProcessor(api_client=api_client)
# Call the protected method directly with the fake response
abstract_text = processor._get_openalex_abstract(fake_openalex_response)
# Verify abstract was extracted
assert abstract_text is not None
# If abstract exists in the response, it should be properly extracted
if 'abstract_inverted_index' in fake_openalex_response:
assert len(abstract_text) > 0
@ -97,15 +96,15 @@ def test_subject_mapper(fake_openalex_response):
"""Test that the SubjectMapper correctly maps OpenAlex topics to subjects."""
# Extract topics from the OpenAlex response
topics = fake_openalex_response.get("topics", [])
# Convert topics to strings - we'll use display_name
topic_names = []
if topics:
topic_names = [topic.get("display_name") for topic in topics if topic.get("display_name")]
# Get subjects using the class method
subjects = SubjectMapper.get_subjects({"topics": topics})
# Verify subjects were returned
assert subjects is not None
assert isinstance(subjects, list)
@ -114,21 +113,21 @@ def test_subject_mapper(fake_openalex_response):
def test_citation_builder(fake_openalex_response):
    """Check that CitationBuilder returns lists for other IDs, grants, and topics."""
    # The finder is constructed with an empty PI list.
    empty_pi_finder = PIFinder(pis=[])
    citation_builder = CitationBuilder(
        data=fake_openalex_response,
        doi="10.1038/srep45389",
        pi_finder=empty_pi_finder,
    )

    # Exercise the builder methods in order: other IDs, grants, topics.
    for build in (
        citation_builder.build_other_ids,
        citation_builder.build_grants,
        citation_builder.build_topics,
    ):
        assert isinstance(build(), list)
@ -140,10 +139,10 @@ def test_license_processor(fake_openalex_response):
license_data = {
"primary_location": fake_openalex_response.get("primary_location", {})
}
# Process the license
license_obj = LicenseProcessor.process_license(license_data)
# Verify license processing
assert license_obj is not None
assert hasattr(license_obj, "name")
@ -158,16 +157,15 @@ def test_pi_finder_find_by_orcid():
given_name="Jon",
orcid="0000-0000-0000-0000",
email="jon.doe@iana.org",
affiliation="Institute of Science, Some University",
project=["Project A01"]
affiliation="Institute of Science, Some University"
)
# Create PIFinder with our test PI
finder = PIFinder(pis=[test_pi])
# Find PI by ORCID
pi = finder._find_by_orcid("0000-0000-0000-0000")
# Verify the PI was found
assert pi is not None
assert pi.family_name == "Doe"
@ -177,7 +175,7 @@ def test_pi_finder_find_by_orcid():
def test_config_load_invalid_path():
    """Config.load_config must raise FileNotFoundError for the given invalid config path."""
    missing_config = "non_existent_config.yaml"

    # The call itself is expected to raise; nothing is asserted about a return value.
    with pytest.raises(FileNotFoundError):
        Config.load_config(config_path=missing_config)
@ -186,20 +184,20 @@ def test_config_load_invalid_path():
def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
    """Test MetadataProcessor._fetch_data against a mocked API response.

    ``doi2dataset.APIClient.make_request`` is patched to return a ``FakeResponse``
    built from the ``fake_openalex_response`` fixture and ``200`` (presumably the
    HTTP status code -- confirm against ``FakeResponse``).
    """
    doi = "10.1038/srep45389"

    # Mock API response (a single, balanced patch call)
    mocker.patch(
        "doi2dataset.APIClient.make_request",
        return_value=FakeResponse(fake_openalex_response, 200),
    )

    # Create processor with upload disabled and progress disabled
    processor = MetadataProcessor(doi=doi, upload=False, progress=False)

    # Test the _fetch_data method directly
    data = processor._fetch_data()

    # Verify that data was fetched correctly
    assert data is not None
    assert data == fake_openalex_response

    # Verify the DOI is correctly stored
    assert processor.doi == doi

View file

@ -1,7 +1,8 @@
import json
import os
from unittest.mock import MagicMock
import pytest
from unittest.mock import MagicMock, patch
from doi2dataset import MetadataProcessor
@ -27,36 +28,35 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
"""Test that _build_metadata correctly extracts basic metadata fields"""
# Mock the console to avoid print errors
metadata_processor.console = MagicMock()
# Mock the Abstract related methods and objects to avoid console errors
abstract_mock = MagicMock()
abstract_mock.text = "This is a sample abstract"
abstract_mock.source = "openalex"
monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
# Mock the _fetch_data method to return our test data
metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
# Mock methods that might cause issues in isolation
metadata_processor._build_description = MagicMock(return_value="Test description")
metadata_processor._get_involved_pis = MagicMock(return_value=[])
metadata_processor._build_organization_metadata = MagicMock(return_value={})
# Call the method we're testing
metadata = metadata_processor._build_metadata(openalex_data)
# Verify the basic metadata fields were extracted correctly
assert metadata is not None
assert 'datasetVersion' in metadata
# Examine the fields inside datasetVersion.metadataBlocks
assert 'metadataBlocks' in metadata['datasetVersion']
citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
# Check fields in citation section
assert 'fields' in citation
fields = citation['fields']
# Check for basic metadata fields in a more flexible way
field_names = [field.get('typeName') for field in fields]
assert 'title' in field_names
@ -68,44 +68,43 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
"""Test that _build_metadata correctly processes author information"""
# Mock the console to avoid print errors
metadata_processor.console = MagicMock()
# Mock the Abstract related methods and objects to avoid console errors
abstract_mock = MagicMock()
abstract_mock.text = "This is a sample abstract"
abstract_mock.source = "openalex"
monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
# Mock the _fetch_data method to return our test data
metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
# Mock methods that might cause issues in isolation
metadata_processor._build_description = MagicMock(return_value="Test description")
metadata_processor._get_involved_pis = MagicMock(return_value=[])
metadata_processor._build_organization_metadata = MagicMock(return_value={})
# Call the method we're testing
metadata = metadata_processor._build_metadata(openalex_data)
# Examine the fields inside datasetVersion.metadataBlocks
assert 'metadataBlocks' in metadata['datasetVersion']
citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
# Check fields in citation section
assert 'fields' in citation
fields = citation['fields']
# Check for author and datasetContact fields
field_names = [field.get('typeName') for field in fields]
assert 'author' in field_names
assert 'datasetContact' in field_names
# Verify these are compound fields with actual entries
for field in fields:
if field.get('typeName') == 'author':
assert 'value' in field
assert isinstance(field['value'], list)
assert len(field['value']) > 0
if field.get('typeName') == 'datasetContact':
assert 'value' in field
assert isinstance(field['value'], list)
@ -117,46 +116,45 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
"""Test that _build_metadata correctly extracts keywords and topics"""
# Mock the console to avoid print errors
metadata_processor.console = MagicMock()
# Mock the Abstract related methods and objects to avoid console errors
abstract_mock = MagicMock()
abstract_mock.text = "This is a sample abstract"
abstract_mock.source = "openalex"
monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
# Mock the _fetch_data method to return our test data
metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
# Mock methods that might cause issues in isolation
metadata_processor._build_description = MagicMock(return_value="Test description")
metadata_processor._get_involved_pis = MagicMock(return_value=[])
metadata_processor._build_organization_metadata = MagicMock(return_value={})
# Call the method we're testing
metadata = metadata_processor._build_metadata(openalex_data)
# Examine the fields inside datasetVersion.metadataBlocks
assert 'metadataBlocks' in metadata['datasetVersion']
citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
# Check fields in citation section
assert 'fields' in citation
fields = citation['fields']
# Check for keyword and subject fields
field_names = [field.get('typeName') for field in fields]
# If keywords exist, verify structure
if 'keyword' in field_names:
for field in fields:
if field.get('typeName') == 'keyword':
assert 'value' in field
assert isinstance(field['value'], list)
# Check for subject field which should definitely exist
assert 'subject' in field_names
for field in fields:
if field.get('typeName') == 'subject':
assert 'value' in field
assert isinstance(field['value'], list)
assert len(field['value']) > 0
assert len(field['value']) > 0

View file

@ -1,5 +1,5 @@
import pytest
from doi2dataset import Person, Institution
from doi2dataset import Institution, Person
def test_person_to_dict_with_string_affiliation():
"""Test Person.to_dict() with a string affiliation."""
@ -8,35 +8,32 @@ def test_person_to_dict_with_string_affiliation():
given_name="John",
orcid="0000-0001-2345-6789",
email="john.doe@example.org",
affiliation="Test University",
project=["Project A"]
affiliation="Test University"
)
result = person.to_dict()
assert result["family_name"] == "Doe"
assert result["given_name"] == "John"
assert result["orcid"] == "0000-0001-2345-6789"
assert result["email"] == "john.doe@example.org"
assert result["project"] == ["Project A"]
assert result["affiliation"] == "Test University"
def test_person_to_dict_with_institution_ror():
"""Test Person.to_dict() with an Institution that has a ROR ID."""
inst = Institution("Test University", "https://ror.org/12345")
person = Person(
family_name="Doe",
given_name="John",
orcid="0000-0001-2345-6789",
email="john.doe@example.org",
affiliation=inst,
project=["Project A"]
affiliation=inst
)
result = person.to_dict()
assert result["affiliation"] == "https://ror.org/12345"
# Check other fields too
assert result["family_name"] == "Doe"
@ -46,16 +43,16 @@ def test_person_to_dict_with_institution_ror():
def test_person_to_dict_with_institution_display_name_only():
"""Test Person.to_dict() with an Institution that has only a display_name."""
inst = Institution("Test University") # No ROR ID
person = Person(
family_name="Smith",
given_name="Jane",
orcid="0000-0001-9876-5432",
affiliation=inst
)
result = person.to_dict()
assert result["affiliation"] == "Test University"
assert result["family_name"] == "Smith"
assert result["given_name"] == "Jane"
@ -65,15 +62,15 @@ def test_person_to_dict_with_empty_institution():
"""Test Person.to_dict() with an Institution that has neither ROR nor display_name."""
# Create an Institution with empty values
inst = Institution("")
person = Person(
family_name="Brown",
given_name="Robert",
affiliation=inst
)
result = person.to_dict()
assert result["affiliation"] == ""
assert result["family_name"] == "Brown"
assert result["given_name"] == "Robert"
@ -86,10 +83,10 @@ def test_person_to_dict_with_no_affiliation():
given_name="Alice",
orcid="0000-0002-1111-2222"
)
result = person.to_dict()
assert result["affiliation"] == ""
assert result["family_name"] == "Green"
assert result["given_name"] == "Alice"
assert result["orcid"] == "0000-0002-1111-2222"
assert result["orcid"] == "0000-0002-1111-2222"