feat!: generalize script by removing organizational metadata

Remove the Phase class, the organizational metadata blocks, and the unused project fields. Update the configuration to use 'default_grants', and restrict PI usage to a fallback for determining the corresponding author.

BREAKING CHANGES:
- Remove 'phase' and 'project' fields from configuration
- Use 'default_grants' instead of 'default_grant'
- Generate only standard Dataverse citation metadata
2025-07-07 14:41:39 +02:00 · 2025-07-07 14:41:39 +02:00 · 67b46d5140
commit 67b46d5140
parent 01bc537bd8
11 changed files with 207 additions and 269 deletions
--- a/tests/config_test.yaml
+++ b/tests/config_test.yaml
@ -1,23 +1,16 @@
-default_grant:
+default_grants:
  - funder: "Awesome Funding Agency"
    id: "ABC12345"

-phase:
-  "Phase 1 (2021/2025)":
-    start: 2021
-    end: 2025
-
 pis:
  - family_name: "Doe"
    given_name: "Jon"
    orcid: "0000-0000-0000-0000"
    email: "jon.doe@iana.org"
    affiliation: "Institute of Science, Some University"
-    project: ["Project A01"]

  - family_name: "Doe"
    given_name: "Jane"
    orcid: "0000-0000-0000-0001"
    email: "jane.doe@iana.org"
    affiliation: "Institute of Science, Some University"
-    project: ["Project A02"]
--- a/tests/test_citation_builder.py
+++ b/tests/test_citation_builder.py
@ -1,13 +1,9 @@
 import json
 import os
-import pytest
-from unittest.mock import MagicMock

-from doi2dataset import (
-    CitationBuilder,
-    PIFinder,
-    Person
-)
+import pytest
+
+from doi2dataset import CitationBuilder, Person, PIFinder


@pytest.fixture
@ -27,8 +23,7 @@ def test_pi():
        given_name="Author",
        orcid="0000-0000-0000-1234",
        email="test.author@example.org",
-        affiliation="Test University",
-        project=["Test Project"]
+        affiliation="Test University"
    )


@ -43,15 +38,15 @@ def test_build_authors(openalex_data, pi_finder):
    """Test that CitationBuilder.build_authors correctly processes author information"""
    doi = "10.1038/srep45389"
    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
-    
+
    # Call the build_authors method - returns tuple of (authors, corresponding_authors)
    authors, corresponding_authors = builder.build_authors()
-    
+
    # Verify that authors were created
    assert authors is not None
    assert isinstance(authors, list)
    assert len(authors) > 0
-    
+
    # Check the structure of the authors
    for author in authors:
        assert hasattr(author, "given_name")
@ -64,17 +59,17 @@ def test_build_authors_with_affiliations(openalex_data, pi_finder):
    """Test that author affiliations are correctly processed"""
    doi = "10.1038/srep45389"
    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
-    
+
    # Call the build_authors method
    authors, _ = builder.build_authors()
-    
+
    # Check if any authors have affiliation
    affiliation_found = False
    for author in authors:
        if hasattr(author, "affiliation") and author.affiliation:
            affiliation_found = True
            break
-    
+
    # We may not have affiliations in the test data, so only assert if we found any
    if affiliation_found:
        assert affiliation_found, "No author with affiliation found"
@ -84,14 +79,14 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder):
    """Test that corresponding authors are correctly identified"""
    doi = "10.1038/srep45389"
    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
-    
+
    # Process authors
    authors, corresponding_authors = builder.build_authors()
-    
+
    # Verify that corresponding authors were identified
    if len(corresponding_authors) > 0:
        assert len(corresponding_authors) > 0, "No corresponding authors identified"
-        
+
        # Check structure of corresponding authors
        for author in corresponding_authors:
            assert hasattr(author, "given_name")
@ -103,7 +98,7 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder):
 def test_build_authors_with_ror(openalex_data, pi_finder):
    """Test that ROR (Research Organization Registry) identifiers are correctly used when ror=True"""
    doi = "10.1038/srep45389"
-    
+
    # First confirm the sample data contains at least one institution with a ROR identifier
    has_ror_institution = False
    for authorship in openalex_data.get("authorships", []):
@ -114,61 +109,61 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
                break
        if has_ror_institution:
            break
-    
+
    # Skip test if no ROR identifiers in sample data
    if not has_ror_institution:
        pytest.skip("Test data doesn't contain any ROR identifiers")
-    
+
    # Create builder with ror=True to enable ROR identifiers
    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True)
-    
+
    # Get authors
    authors, _ = builder.build_authors()
-    
+
    # Verify we got authors back
    assert len(authors) > 0, "No authors were extracted from the test data"
-    
+
    # Check for at least one Institution with a ROR ID
    ror_found = False
    institution_with_ror = None
-    
+
    for author in authors:
        # Check if author has affiliation
        if not hasattr(author, 'affiliation') or not author.affiliation:
            continue
-        
+
        # Check if affiliation is an Institution with a ROR ID
        if not hasattr(author.affiliation, 'ror'):
            continue
-            
+
        # Check if ROR ID is present and contains "ror.org"
        if author.affiliation.ror and "ror.org" in author.affiliation.ror:
            ror_found = True
            institution_with_ror = author.affiliation
            break
-    
+
    # Verify ROR IDs are used when ror=True
    assert ror_found, "Expected at least one author with a ROR ID when ror=True"
-    
+
    # Check expanded_value in the affiliation field when ROR is used
    if institution_with_ror:
        # Get the affiliation field
        affiliation_field = institution_with_ror.affiliation_field()
-        
+
        # Verify it's set up correctly with the ROR ID as the value
        assert affiliation_field.value == institution_with_ror.ror
-        
+
        # Verify the expanded_value dictionary has the expected structure
        assert hasattr(affiliation_field, 'expanded_value')
        assert isinstance(affiliation_field.expanded_value, dict)
-        
+
        # Check specific fields in the expanded_value
        expanded_value = affiliation_field.expanded_value
        assert "scheme" in expanded_value
        assert expanded_value["scheme"] == "http://www.grid.ac/ontology/"
-        
+
        assert "termName" in expanded_value
        assert expanded_value["termName"] == institution_with_ror.display_name
-        
+
        assert "@type" in expanded_value
-        assert expanded_value["@type"] == "https://schema.org/Organization"
+        assert expanded_value["@type"] == "https://schema.org/Organization"
--- a/tests/test_doi2dataset.py
+++ b/tests/test_doi2dataset.py
@ -3,21 +3,9 @@ import sys

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

-from doi2dataset import NameProcessor, Phase, sanitize_filename, validate_email_address
+from doi2dataset import NameProcessor, sanitize_filename, validate_email_address


-def test_phase_check_year():
-    """Test that check_year correctly determines if a year is within the phase boundaries."""
-    phase = Phase("TestPhase", 2000, 2010)
-    # Within boundaries
-    assert phase.check_year(2005) is True
-    # Outside boundaries
-    assert phase.check_year(1999) is False
-    assert phase.check_year(2011) is False
-    # Boundary cases
-    assert phase.check_year(2000) is True
-    assert phase.check_year(2010) is True
-
 def test_sanitize_filename():
    """Test the sanitize_filename function to convert DOI to a valid filename."""
    doi = "10.1234/abc.def"
--- a/tests/test_fetch_doi_mock.py
+++ b/tests/test_fetch_doi_mock.py
@ -4,16 +4,15 @@ import os
 import pytest

 from doi2dataset import (
-    AbstractProcessor, 
+    AbstractProcessor,
    APIClient,
-    CitationBuilder, 
-    Config, 
-    License,
-    LicenseProcessor, 
+    CitationBuilder,
+    Config,
+    LicenseProcessor,
    MetadataProcessor,
    Person,
    PIFinder,
-    SubjectMapper
+    SubjectMapper,
 )


@ -78,16 +77,16 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
    """Test the extraction of abstracts from OpenAlex inverted index data."""
    # Create API client for AbstractProcessor
    api_client = APIClient()
-    
+
    # Create processor
    processor = AbstractProcessor(api_client=api_client)
-    
+
    # Call the protected method directly with the fake response
    abstract_text = processor._get_openalex_abstract(fake_openalex_response)
-    
+
    # Verify abstract was extracted
    assert abstract_text is not None
-    
+
    # If abstract exists in the response, it should be properly extracted
    if 'abstract_inverted_index' in fake_openalex_response:
        assert len(abstract_text) > 0
@ -97,15 +96,15 @@ def test_subject_mapper(fake_openalex_response):
    """Test that the SubjectMapper correctly maps OpenAlex topics to subjects."""
    # Extract topics from the OpenAlex response
    topics = fake_openalex_response.get("topics", [])
-    
+
    # Convert topics to strings - we'll use display_name
    topic_names = []
    if topics:
        topic_names = [topic.get("display_name") for topic in topics if topic.get("display_name")]
-    
+
    # Get subjects using the class method
    subjects = SubjectMapper.get_subjects({"topics": topics})
-    
+
    # Verify subjects were returned
    assert subjects is not None
    assert isinstance(subjects, list)
@ -114,21 +113,21 @@ def test_subject_mapper(fake_openalex_response):
 def test_citation_builder(fake_openalex_response):
    """Test that the CitationBuilder correctly builds author information."""
    doi = "10.1038/srep45389"
-    
+
    # Mock PIFinder with an empty list of PIs
    pi_finder = PIFinder(pis=[])
-    
+
    # Create builder with required arguments
    builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder)
-    
+
    # Test building other IDs
    other_ids = builder.build_other_ids()
    assert isinstance(other_ids, list)
-    
+
    # Test building grants
    grants = builder.build_grants()
    assert isinstance(grants, list)
-    
+
    # Test building topics
    topics = builder.build_topics()
    assert isinstance(topics, list)
@ -140,10 +139,10 @@ def test_license_processor(fake_openalex_response):
    license_data = {
        "primary_location": fake_openalex_response.get("primary_location", {})
    }
-    
+
    # Process the license
    license_obj = LicenseProcessor.process_license(license_data)
-    
+
    # Verify license processing
    assert license_obj is not None
    assert hasattr(license_obj, "name")
@ -158,16 +157,15 @@ def test_pi_finder_find_by_orcid():
        given_name="Jon",
        orcid="0000-0000-0000-0000",
        email="jon.doe@iana.org",
-        affiliation="Institute of Science, Some University",
-        project=["Project A01"]
+        affiliation="Institute of Science, Some University"
    )
-    
+
    # Create PIFinder with our test PI
    finder = PIFinder(pis=[test_pi])
-    
+
    # Find PI by ORCID
    pi = finder._find_by_orcid("0000-0000-0000-0000")
-    
+
    # Verify the PI was found
    assert pi is not None
    assert pi.family_name == "Doe"
@ -177,7 +175,7 @@ def test_pi_finder_find_by_orcid():
 def test_config_load_invalid_path():
    """Test that Config.load_config raises an error when an invalid path is provided."""
    invalid_path = "non_existent_config.yaml"
-    
+
    # Verify that attempting to load a non-existent config raises an error
    with pytest.raises(FileNotFoundError):
        Config.load_config(config_path=invalid_path)
@ -186,20 +184,20 @@ def test_config_load_invalid_path():
 def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
    """Test the _fetch_data method of the MetadataProcessor class with mocked responses."""
    doi = "10.1038/srep45389"
-    
+
    # Mock API response
-    mocker.patch("doi2dataset.APIClient.make_request", 
+    mocker.patch("doi2dataset.APIClient.make_request",
                 return_value=FakeResponse(fake_openalex_response, 200))
-    
+
    # Create processor with upload disabled and progress disabled
    processor = MetadataProcessor(doi=doi, upload=False, progress=False)
-    
+
    # Test the _fetch_data method directly
    data = processor._fetch_data()
-    
+
    # Verify that data was fetched correctly
    assert data is not None
    assert data == fake_openalex_response
-    
+
    # Verify the DOI is correctly stored
    assert processor.doi == doi
--- a/tests/test_metadata_processor.py
+++ b/tests/test_metadata_processor.py
@ -1,7 +1,8 @@
 import json
 import os
+from unittest.mock import MagicMock
+
 import pytest
-from unittest.mock import MagicMock, patch

 from doi2dataset import MetadataProcessor

@ -27,36 +28,35 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
    """Test that _build_metadata correctly extracts basic metadata fields"""
    # Mock the console to avoid print errors
    metadata_processor.console = MagicMock()
-    
+
    # Mock the Abstract related methods and objects to avoid console errors
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
-    
+
    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
-    
+
    # Mock methods that might cause issues in isolation
    metadata_processor._build_description = MagicMock(return_value="Test description")
    metadata_processor._get_involved_pis = MagicMock(return_value=[])
-    metadata_processor._build_organization_metadata = MagicMock(return_value={})
-    
+
    # Call the method we're testing
    metadata = metadata_processor._build_metadata(openalex_data)

    # Verify the basic metadata fields were extracted correctly
    assert metadata is not None
    assert 'datasetVersion' in metadata
-    
+
    # Examine the fields inside datasetVersion.metadataBlocks
    assert 'metadataBlocks' in metadata['datasetVersion']
    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
-    
+
    # Check fields in citation section
    assert 'fields' in citation
    fields = citation['fields']
-    
+
    # Check for basic metadata fields in a more flexible way
    field_names = [field.get('typeName') for field in fields]
    assert 'title' in field_names
@ -68,44 +68,43 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    """Test that _build_metadata correctly processes author information"""
    # Mock the console to avoid print errors
    metadata_processor.console = MagicMock()
-    
+
    # Mock the Abstract related methods and objects to avoid console errors
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
-    
+
    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
-    
+
    # Mock methods that might cause issues in isolation
    metadata_processor._build_description = MagicMock(return_value="Test description")
    metadata_processor._get_involved_pis = MagicMock(return_value=[])
-    metadata_processor._build_organization_metadata = MagicMock(return_value={})
-    
+
    # Call the method we're testing
    metadata = metadata_processor._build_metadata(openalex_data)

    # Examine the fields inside datasetVersion.metadataBlocks
    assert 'metadataBlocks' in metadata['datasetVersion']
    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
-    
+
    # Check fields in citation section
    assert 'fields' in citation
    fields = citation['fields']
-    
+
    # Check for author and datasetContact fields
    field_names = [field.get('typeName') for field in fields]
    assert 'author' in field_names
    assert 'datasetContact' in field_names
-    
+
    # Verify these are compound fields with actual entries
    for field in fields:
        if field.get('typeName') == 'author':
            assert 'value' in field
            assert isinstance(field['value'], list)
            assert len(field['value']) > 0
-        
+
        if field.get('typeName') == 'datasetContact':
            assert 'value' in field
            assert isinstance(field['value'], list)
@ -117,46 +116,45 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
    """Test that _build_metadata correctly extracts keywords and topics"""
    # Mock the console to avoid print errors
    metadata_processor.console = MagicMock()
-    
+
    # Mock the Abstract related methods and objects to avoid console errors
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
-    
+
    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
-    
+
    # Mock methods that might cause issues in isolation
    metadata_processor._build_description = MagicMock(return_value="Test description")
    metadata_processor._get_involved_pis = MagicMock(return_value=[])
-    metadata_processor._build_organization_metadata = MagicMock(return_value={})
-    
+
    # Call the method we're testing
    metadata = metadata_processor._build_metadata(openalex_data)
-        
+
    # Examine the fields inside datasetVersion.metadataBlocks
    assert 'metadataBlocks' in metadata['datasetVersion']
    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
-    
+
    # Check fields in citation section
    assert 'fields' in citation
    fields = citation['fields']
-    
+
    # Check for keyword and subject fields
    field_names = [field.get('typeName') for field in fields]
-    
+
    # If keywords exist, verify structure
    if 'keyword' in field_names:
        for field in fields:
            if field.get('typeName') == 'keyword':
                assert 'value' in field
                assert isinstance(field['value'], list)
-    
+
    # Check for subject field which should definitely exist
    assert 'subject' in field_names
    for field in fields:
        if field.get('typeName') == 'subject':
            assert 'value' in field
            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+            assert len(field['value']) > 0
--- a/tests/test_person.py
+++ b/tests/test_person.py
@ -1,5 +1,5 @@
-import pytest
-from doi2dataset import Person, Institution
+from doi2dataset import Institution, Person
+

 def test_person_to_dict_with_string_affiliation():
    """Test Person.to_dict() with a string affiliation."""
@ -8,35 +8,32 @@ def test_person_to_dict_with_string_affiliation():
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation="Test University",
-        project=["Project A"]
+        affiliation="Test University"
    )
-    
+
    result = person.to_dict()
-    
+
    assert result["family_name"] == "Doe"
    assert result["given_name"] == "John"
    assert result["orcid"] == "0000-0001-2345-6789"
    assert result["email"] == "john.doe@example.org"
-    assert result["project"] == ["Project A"]
    assert result["affiliation"] == "Test University"


 def test_person_to_dict_with_institution_ror():
    """Test Person.to_dict() with an Institution that has a ROR ID."""
    inst = Institution("Test University", "https://ror.org/12345")
-    
+
    person = Person(
        family_name="Doe",
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation=inst,
-        project=["Project A"]
+        affiliation=inst
    )
-    
+
    result = person.to_dict()
-    
+
    assert result["affiliation"] == "https://ror.org/12345"
    # Check other fields too
    assert result["family_name"] == "Doe"
@ -46,16 +43,16 @@ def test_person_to_dict_with_institution_ror():
 def test_person_to_dict_with_institution_display_name_only():
    """Test Person.to_dict() with an Institution that has only a display_name."""
    inst = Institution("Test University")  # No ROR ID
-    
+
    person = Person(
        family_name="Smith",
        given_name="Jane",
        orcid="0000-0001-9876-5432",
        affiliation=inst
    )
-    
+
    result = person.to_dict()
-    
+
    assert result["affiliation"] == "Test University"
    assert result["family_name"] == "Smith"
    assert result["given_name"] == "Jane"
@ -65,15 +62,15 @@ def test_person_to_dict_with_empty_institution():
    """Test Person.to_dict() with an Institution that has neither ROR nor display_name."""
    # Create an Institution with empty values
    inst = Institution("")
-    
+
    person = Person(
        family_name="Brown",
        given_name="Robert",
        affiliation=inst
    )
-    
+
    result = person.to_dict()
-    
+
    assert result["affiliation"] == ""
    assert result["family_name"] == "Brown"
    assert result["given_name"] == "Robert"
@ -86,10 +83,10 @@ def test_person_to_dict_with_no_affiliation():
        given_name="Alice",
        orcid="0000-0002-1111-2222"
    )
-    
+
    result = person.to_dict()
-    
+
    assert result["affiliation"] == ""
    assert result["family_name"] == "Green"
    assert result["given_name"] == "Alice"
-    assert result["orcid"] == "0000-0002-1111-2222"
+    assert result["orcid"] == "0000-0002-1111-2222"