From 67b46d5140691cb154924171b823c78c9246fea1 Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Mon, 7 Jul 2025 14:41:39 +0200 Subject: [PATCH] feat!: generalize script by removing organizational metadata Remove Phase class, organizational metadata blocks, and unused project fields. Update configuration to use 'default_grants' and simplify PI usage to fallback corresponding author determination only. BREAKING CHANGES: - Remove 'phase' and 'project' fields from configuration - Use 'default_grants' instead of 'default_grant' - Generate only standard Dataverse citation metadata --- .gitignore | 1 + README.md | 80 +++++++++++++++++--- __init__.py | 5 +- config_example.yaml | 22 +++--- doi2dataset.py | 123 ++++--------------------------- tests/config_test.yaml | 9 +-- tests/test_citation_builder.py | 65 ++++++++-------- tests/test_doi2dataset.py | 14 +--- tests/test_fetch_doi_mock.py | 62 ++++++++-------- tests/test_metadata_processor.py | 56 +++++++------- tests/test_person.py | 39 +++++----- 11 files changed, 207 insertions(+), 269 deletions(-) diff --git a/.gitignore b/.gitignore index 6ff6e4c..73052a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Config file config.yaml +.config.yaml # Processed DOIs *.json diff --git a/README.md b/README.md index 8b66b2a..6470fb4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,14 @@ - **DOI Validation and Normalization:** Validates DOIs and converts them into a standardized format. - **Metadata Retrieval:** Fetches metadata such as title, abstract, license, and author information from external sources. -- **Metadata Mapping:** Automatically maps and generates metadata fields (e.g., title, description, keywords) including support for controlled vocabularies and compound fields. 
+- **Standard Dataverse Metadata:** Generates standard Dataverse citation metadata including: + - Title, publication date, and alternative URL + - Author information with affiliations and ORCID identifiers + - Dataset contact information (corresponding authors) + - Abstract and description + - Keywords and subject classification + - Grant/funding information + - License information when available - **Optional Upload:** Allows uploading of metadata directly to a Dataverse.org server. - **Progress Tracking:** Uses the Rich library for user-friendly progress tracking and error handling. @@ -23,14 +30,41 @@ cd doi2dataset ## Configuration -Configuration - Before running the tool, configure the necessary settings in the `config.yaml` file located in the project root. This file contains configuration details such as: -- Connection details (URL, API token, authentication credentials) -- Mapping of project phases -- Principal Investigator (PI) information -- Default grant configurations +- **Connection details**: URL, API token, authentication credentials for Dataverse server +- **Principal Investigator (PI) information**: Optional - used for fallback determination of corresponding authors when not explicitly specified in the publication +- **Default grant configurations**: Funding information to be included in the metadata (supports multiple grants) + +### Configuration File Structure + +The configuration file should follow this structure: + +```yaml +# Dataverse server connection details +dataverse: + url: "https://your-dataverse-instance.org" + api_token: "your-api-token" + +# Default grant information (supports multiple grants) +default_grants: + - funder: "Your Funding Agency" + id: "GRANT123456" + - funder: "Another Funding Agency" + id: "GRANT789012" + +# Principal investigators for fallback corresponding author determination (optional) +pis: + - family_name: "Doe" + given_name: "John" + orcid: "0000-0000-0000-0000" + email: "john.doe@university.edu" + affiliation: 
"Department of Science, University" +``` + +See `config_example.yaml` for a complete example configuration. + +**Note**: The PI section is optional. If no corresponding authors are found in the publication metadata and no PIs are configured, the tool will still generate metadata but may issue a warning about missing corresponding author information. ## Usage @@ -102,11 +136,13 @@ pytest --cov=. --cov-report=html This creates a `htmlcov` directory. Open `htmlcov/index.html` in a browser to view the detailed coverage report. A `.coveragerc` configuration file is provided that: + - Excludes test files, documentation, and boilerplate code from coverage analysis - Configures reporting to ignore common non-testable lines (like defensive imports) - Sets the output directory for HTML reports Recent improvements have increased coverage from 48% to 61% by adding focused tests for: + - Citation building functionality - License processing and validation - Metadata field extraction @@ -114,6 +150,7 @@ Recent improvements have increased coverage from 48% to 61% by adding focused te - Publication data parsing and validation Areas that could benefit from additional testing: + - More edge cases in the MetadataProcessor class workflow - Additional CitationBuilder scenarios with diverse inputs - Complex network interactions and error handling @@ -122,7 +159,7 @@ Areas that could benefit from additional testing: The test suite is organized into six main files: -1. **test_doi2dataset.py**: Basic tests for core functions like phase checking, name splitting and DOI validation. +1. **test_doi2dataset.py**: Basic tests for core functions like name splitting, DOI validation, and filename sanitization. 2. **test_fetch_doi_mock.py**: Tests API interactions using a mock OpenAlex response stored in `srep45389.json`. 3. **test_citation_builder.py**: Tests for building citation metadata from API responses. 4. **test_metadata_processor.py**: Tests for the metadata processing workflow. 
@@ -136,7 +173,6 @@ The test suite covers the following categories of functionality: #### Core Functionality Tests - **DOI Validation and Processing**: Parameterized tests for DOI normalization, validation, and filename sanitization with various inputs. -- **Phase Management**: Tests for checking publication year against defined project phases, including boundary cases. - **Name Processing**: Extensive tests for parsing and splitting author names in different formats (with/without commas, middle initials, etc.). - **Email Validation**: Tests for proper validation of email addresses with various domain configurations. @@ -151,12 +187,36 @@ The test suite covers the following categories of functionality: - **Citation Building**: Tests for properly building citation metadata from API responses. - **License Processing**: Tests for correctly identifying and formatting license information from various license IDs. -- **Principal Investigator Matching**: Tests for finding project PIs based on ORCID identifiers. +- **Principal Investigator Matching**: Tests for finding project PIs based on ORCID identifiers (used for fallback corresponding author determination). - **Configuration Loading**: Tests for properly loading and validating configuration from files. - **Metadata Workflow**: Tests for the complete metadata processing workflow. These tests ensure that all components work correctly in isolation and together as a system, with special attention to edge cases and error handling. 
+## Changelog + +### Version 0.2.0 - Generalization Update + +This version has been updated to make the tool more generalized and suitable for broader use cases: + +**Breaking Changes:** + +- Removed organization-specific metadata blocks (project phases, organizational fields) +- Removed `Phase` class and phase-related configuration +- Simplified configuration structure + +**What's New:** + +- Streamlined metadata generation focusing on standard Dataverse citation metadata +- Reduced configuration requirements for easier adoption +- Maintained PI information support for corresponding author fallback functionality + +**Migration Guide:** + +- Remove the `phase` section and any `project` entries under `pis` from your configuration file, and rename `default_grant` to `default_grants` +- The tool will now generate only standard citation metadata blocks +- PI information is still supported and used for fallback corresponding author determination + ## Contributing Contributions are welcome! Please fork the repository and submit a pull request with your improvements. diff --git a/__init__.py b/__init__.py index 0db05d7..f1bd259 100644 --- a/__init__.py +++ b/__init__.py @@ -8,10 +8,9 @@ from .doi2dataset import ( LicenseProcessor, MetadataProcessor, NameProcessor, + Person, PIFinder, - Person, - Phase, SubjectMapper, sanitize_filename, validate_email_address, -) \ No newline at end of file +) diff --git a/config_example.yaml b/config_example.yaml index f14cdad..d00d523 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -1,23 +1,25 @@ -default_grant: +dataverse: + url: "https://your-dataverse-instance.org" + api_token: "your-api-token-here" + dataverse: "your-dataverse-alias" + auth_user: "your-username" + auth_password: "your-password" + +default_grants: - funder: "Awesome Funding Agency" id: "ABC12345" - -phase: - "Phase 1 (2021/2025)": - start: 2021 - end: 2025 + - funder: "Another Funding Agency" + id: "DEF67890" pis: - family_name: "Doe" given_name: "Jon" orcid: "0000-0000-0000-0000" - email:
"jon.doe@iana.org" affiliation: "Institute of Science, Some University" - project: ["Project A01"] - family_name: "Doe" given_name: "Jane" orcid: "0000-0000-0000-0001" - email: "jane.doe@some-university.edu" + email: "jane.doe@iana.org" affiliation: "Institute of Science, Some University" - project: ["Project A02"] diff --git a/doi2dataset.py b/doi2dataset.py index 162a53a..438e3a2 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -109,36 +109,6 @@ class FieldType(Enum): COMPOUND = "compound" VOCABULARY = "controlledVocabulary" -@dataclass -class Phase: - """ - Represents a project phase with a defined time span. - - Attributes: - name (str): The name of the project phase. - start (int): The start year of the project phase. - end (int): The end year of the project phase. - """ - - name: str - start: int - end: int - - def check_year(self, year: int) -> bool: - """ - Checks whether a given year falls within the project's phase boundaries. - - Args: - year (int): The year to check. - - Returns: - bool: True if the year is within the phase boundaries, otherwise False. - """ - - if self.start <= year <= self.end: - return True - return False - @dataclass class BaseMetadataField[T]: """ @@ -301,7 +271,7 @@ class Institution: "termName": self.display_name, "@type": "https://schema.org/Organization" } - return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value) + return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value=expanded_value) else: return PrimitiveMetadataField("authorAffiliation", False, self.display_name) @@ -316,14 +286,12 @@ class Person: orcid (str): ORCID identifier (optional). email (str): Email address (optional). affiliation (Institution): Affiliation of the person (optional). - project (list[str]): List of associated projects. 
""" family_name: str given_name: str orcid: str = "" email: str = "" affiliation: Institution | str = "" - project: list[str] = field(default_factory=list) def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]: """ @@ -340,8 +308,7 @@ class Person: "family_name": self.family_name, "given_name": self.given_name, "orcid": self.orcid, - "email": self.email, - "project": self.project + "email": self.email } if isinstance(self.affiliation, Institution): @@ -464,12 +431,10 @@ class ConfigData: Attributes: dataverse (dict[str, str]): Dataverse-related configuration. - phase (dict[str, dict[str, int]]): Mapping of project phases. pis (list[dict[str, Any]]): List of principal investigator configurations. default_grants (list[dict[str, str]]): Default grant configurations. """ dataverse: dict[str, str] - phase: dict[str, dict[str, int]] pis: list[dict[str, Any]] default_grants: list[dict[str, str]] @@ -523,7 +488,6 @@ class Config: cls._config_data = ConfigData( dataverse=config_data.get('dataverse', {}), - phase=config_data.get('phase', {}), pis=config_data.get('pis', []), default_grants=config_data.get('default_grants', []) ) @@ -545,16 +509,6 @@ class Config: raise RuntimeError("Failed to load configuration") return cls._config_data - @property - def PHASE(self) -> dict[str, dict[str, int]]: - """ - Get phase configuration. - - Returns: - dict[str, dict[str, int]]: Mapping of phases. - """ - return self.get_config().phase - @property def PIS(self) -> list[dict[str, Any]]: """ @@ -833,7 +787,10 @@ class AbstractProcessor: else: console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning") else: - console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") + if license.name: + console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. 
Reconstructing abstract from OpenAlex!", style="info") + else: + console.print(f"\n{ICONS['info']} Custom license does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") openalex_abstract = self._get_openalex_abstract(data) @@ -1406,8 +1363,7 @@ class MetadataProcessor: CompoundMetadataField("grantNumber", True, grants).to_dict() ], "displayName": "Citation Metadata" - }, - "crc1430_org_v1": self._build_organization_metadata(data) + } }, "files": [] } @@ -1473,71 +1429,22 @@ class MetadataProcessor: """ return data.get("publication_year", "") - def _build_organization_metadata(self, data: dict[str, Any]) -> dict[str, Any]: - """ - Build organization metadata fields (phase, project, PI names). - Args: - data (dict[str, Any]): The metadata. - - Returns: - dict[str, Any]: Organization metadata. - """ - publication_year = self._get_publication_year(data) - if publication_year: - phases = self._get_phases(int(publication_year)) - else: - phases = [] - - pis = self._get_involved_pis(data) - projects: list[str] = [] - for pi in pis: - for project in pi.project: - projects.append(project) - - pi_names: list[str] = [] - for pi in pis: - pi_names.append(pi.format_name()) - - # Deduplicate projects and PI names - unique_projects = list(set(projects)) - unique_pi_names = list(set(pi_names)) - - return { - "fields": [ - ControlledVocabularyMetadataField("crc1430OrgV1Phase", True, phases).to_dict(), - ControlledVocabularyMetadataField("crc1430OrgV1Project", True, unique_projects).to_dict(), - ControlledVocabularyMetadataField("crc1430OrgV1PI", True, unique_pi_names).to_dict() - ] - } - - def _get_phases(self, year: int) -> list[str]: - """ - Determine the project phases matching a given publication year. - - Args: - year (int): The publication year. - - Returns: - list[str]: List of matching phase names. 
- """ - config = Config() - matching_phases: list[str] = [] - for phase_name, phase_info in config.PHASE.items(): - phase = Phase(phase_name, phase_info["start"], phase_info["end"]) - if phase.check_year(year): - matching_phases.append(phase.name) - return matching_phases def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]: """ - Identify involved principal investigators from the metadata. + Identify involved principal investigators from the metadata for use as fallback + corresponding authors. + + This method matches authors in the publication metadata against the configured + PIs and returns matching PIs. It is used as a fallback when no corresponding + authors are explicitly declared in the publication metadata. Args: - data (dict[str, Any]): The metadata. + data (dict[str, Any]): The metadata from OpenAlex. Returns: - list[Person]: List of PIs. + list[Person]: List of matching PIs for use as corresponding authors. """ involved_pis: list[Person] = [] for authorship in data.get("authorships", []): diff --git a/tests/config_test.yaml b/tests/config_test.yaml index e17f88d..9130659 100644 --- a/tests/config_test.yaml +++ b/tests/config_test.yaml @@ -1,23 +1,16 @@ -default_grant: +default_grants: - funder: "Awesome Funding Agency" id: "ABC12345" -phase: - "Phase 1 (2021/2025)": - start: 2021 - end: 2025 - pis: - family_name: "Doe" given_name: "Jon" orcid: "0000-0000-0000-0000" email: "jon.doe@iana.org" affiliation: "Institute of Science, Some University" - project: ["Project A01"] - family_name: "Doe" given_name: "Jane" orcid: "0000-0000-0000-0001" email: "jane.doe@iana.org" affiliation: "Institute of Science, Some University" - project: ["Project A02"] diff --git a/tests/test_citation_builder.py b/tests/test_citation_builder.py index f66aa1a..055e93e 100644 --- a/tests/test_citation_builder.py +++ b/tests/test_citation_builder.py @@ -1,13 +1,9 @@ import json import os -import pytest -from unittest.mock import MagicMock -from doi2dataset import ( - 
CitationBuilder, - PIFinder, - Person -) +import pytest + +from doi2dataset import CitationBuilder, Person, PIFinder @pytest.fixture @@ -27,8 +23,7 @@ def test_pi(): given_name="Author", orcid="0000-0000-0000-1234", email="test.author@example.org", - affiliation="Test University", - project=["Test Project"] + affiliation="Test University" ) @@ -43,15 +38,15 @@ def test_build_authors(openalex_data, pi_finder): """Test that CitationBuilder.build_authors correctly processes author information""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Call the build_authors method - returns tuple of (authors, corresponding_authors) authors, corresponding_authors = builder.build_authors() - + # Verify that authors were created assert authors is not None assert isinstance(authors, list) assert len(authors) > 0 - + # Check the structure of the authors for author in authors: assert hasattr(author, "given_name") @@ -64,17 +59,17 @@ def test_build_authors_with_affiliations(openalex_data, pi_finder): """Test that author affiliations are correctly processed""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Call the build_authors method authors, _ = builder.build_authors() - + # Check if any authors have affiliation affiliation_found = False for author in authors: if hasattr(author, "affiliation") and author.affiliation: affiliation_found = True break - + # We may not have affiliations in the test data, so only assert if we found any if affiliation_found: assert affiliation_found, "No author with affiliation found" @@ -84,14 +79,14 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder): """Test that corresponding authors are correctly identified""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Process authors authors, corresponding_authors = builder.build_authors() - + # Verify that 
corresponding authors were identified if len(corresponding_authors) > 0: assert len(corresponding_authors) > 0, "No corresponding authors identified" - + # Check structure of corresponding authors for author in corresponding_authors: assert hasattr(author, "given_name") @@ -103,7 +98,7 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder): def test_build_authors_with_ror(openalex_data, pi_finder): """Test that ROR (Research Organization Registry) identifiers are correctly used when ror=True""" doi = "10.1038/srep45389" - + # First confirm the sample data contains at least one institution with a ROR identifier has_ror_institution = False for authorship in openalex_data.get("authorships", []): @@ -114,61 +109,61 @@ def test_build_authors_with_ror(openalex_data, pi_finder): break if has_ror_institution: break - + # Skip test if no ROR identifiers in sample data if not has_ror_institution: pytest.skip("Test data doesn't contain any ROR identifiers") - + # Create builder with ror=True to enable ROR identifiers builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True) - + # Get authors authors, _ = builder.build_authors() - + # Verify we got authors back assert len(authors) > 0, "No authors were extracted from the test data" - + # Check for at least one Institution with a ROR ID ror_found = False institution_with_ror = None - + for author in authors: # Check if author has affiliation if not hasattr(author, 'affiliation') or not author.affiliation: continue - + # Check if affiliation is an Institution with a ROR ID if not hasattr(author.affiliation, 'ror'): continue - + # Check if ROR ID is present and contains "ror.org" if author.affiliation.ror and "ror.org" in author.affiliation.ror: ror_found = True institution_with_ror = author.affiliation break - + # Verify ROR IDs are used when ror=True assert ror_found, "Expected at least one author with a ROR ID when ror=True" - + # Check expanded_value in the affiliation field 
when ROR is used if institution_with_ror: # Get the affiliation field affiliation_field = institution_with_ror.affiliation_field() - + # Verify it's set up correctly with the ROR ID as the value assert affiliation_field.value == institution_with_ror.ror - + # Verify the expanded_value dictionary has the expected structure assert hasattr(affiliation_field, 'expanded_value') assert isinstance(affiliation_field.expanded_value, dict) - + # Check specific fields in the expanded_value expanded_value = affiliation_field.expanded_value assert "scheme" in expanded_value assert expanded_value["scheme"] == "http://www.grid.ac/ontology/" - + assert "termName" in expanded_value assert expanded_value["termName"] == institution_with_ror.display_name - + assert "@type" in expanded_value - assert expanded_value["@type"] == "https://schema.org/Organization" \ No newline at end of file + assert expanded_value["@type"] == "https://schema.org/Organization" diff --git a/tests/test_doi2dataset.py b/tests/test_doi2dataset.py index 65ceecb..e5515d8 100644 --- a/tests/test_doi2dataset.py +++ b/tests/test_doi2dataset.py @@ -3,21 +3,9 @@ import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from doi2dataset import NameProcessor, Phase, sanitize_filename, validate_email_address +from doi2dataset import NameProcessor, sanitize_filename, validate_email_address -def test_phase_check_year(): - """Test that check_year correctly determines if a year is within the phase boundaries.""" - phase = Phase("TestPhase", 2000, 2010) - # Within boundaries - assert phase.check_year(2005) is True - # Outside boundaries - assert phase.check_year(1999) is False - assert phase.check_year(2011) is False - # Boundary cases - assert phase.check_year(2000) is True - assert phase.check_year(2010) is True - def test_sanitize_filename(): """Test the sanitize_filename function to convert DOI to a valid filename.""" doi = "10.1234/abc.def" diff --git a/tests/test_fetch_doi_mock.py 
b/tests/test_fetch_doi_mock.py index e9f1f44..6e2745c 100644 --- a/tests/test_fetch_doi_mock.py +++ b/tests/test_fetch_doi_mock.py @@ -4,16 +4,15 @@ import os import pytest from doi2dataset import ( - AbstractProcessor, + AbstractProcessor, APIClient, - CitationBuilder, - Config, - License, - LicenseProcessor, + CitationBuilder, + Config, + LicenseProcessor, MetadataProcessor, Person, PIFinder, - SubjectMapper + SubjectMapper, ) @@ -78,16 +77,16 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response): """Test the extraction of abstracts from OpenAlex inverted index data.""" # Create API client for AbstractProcessor api_client = APIClient() - + # Create processor processor = AbstractProcessor(api_client=api_client) - + # Call the protected method directly with the fake response abstract_text = processor._get_openalex_abstract(fake_openalex_response) - + # Verify abstract was extracted assert abstract_text is not None - + # If abstract exists in the response, it should be properly extracted if 'abstract_inverted_index' in fake_openalex_response: assert len(abstract_text) > 0 @@ -97,15 +96,15 @@ def test_subject_mapper(fake_openalex_response): """Test that the SubjectMapper correctly maps OpenAlex topics to subjects.""" # Extract topics from the OpenAlex response topics = fake_openalex_response.get("topics", []) - + # Convert topics to strings - we'll use display_name topic_names = [] if topics: topic_names = [topic.get("display_name") for topic in topics if topic.get("display_name")] - + # Get subjects using the class method subjects = SubjectMapper.get_subjects({"topics": topics}) - + # Verify subjects were returned assert subjects is not None assert isinstance(subjects, list) @@ -114,21 +113,21 @@ def test_subject_mapper(fake_openalex_response): def test_citation_builder(fake_openalex_response): """Test that the CitationBuilder correctly builds author information.""" doi = "10.1038/srep45389" - + # Mock PIFinder with an empty list of PIs pi_finder 
= PIFinder(pis=[]) - + # Create builder with required arguments builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder) - + # Test building other IDs other_ids = builder.build_other_ids() assert isinstance(other_ids, list) - + # Test building grants grants = builder.build_grants() assert isinstance(grants, list) - + # Test building topics topics = builder.build_topics() assert isinstance(topics, list) @@ -140,10 +139,10 @@ def test_license_processor(fake_openalex_response): license_data = { "primary_location": fake_openalex_response.get("primary_location", {}) } - + # Process the license license_obj = LicenseProcessor.process_license(license_data) - + # Verify license processing assert license_obj is not None assert hasattr(license_obj, "name") @@ -158,16 +157,15 @@ def test_pi_finder_find_by_orcid(): given_name="Jon", orcid="0000-0000-0000-0000", email="jon.doe@iana.org", - affiliation="Institute of Science, Some University", - project=["Project A01"] + affiliation="Institute of Science, Some University" ) - + # Create PIFinder with our test PI finder = PIFinder(pis=[test_pi]) - + # Find PI by ORCID pi = finder._find_by_orcid("0000-0000-0000-0000") - + # Verify the PI was found assert pi is not None assert pi.family_name == "Doe" @@ -177,7 +175,7 @@ def test_pi_finder_find_by_orcid(): def test_config_load_invalid_path(): """Test that Config.load_config raises an error when an invalid path is provided.""" invalid_path = "non_existent_config.yaml" - + # Verify that attempting to load a non-existent config raises an error with pytest.raises(FileNotFoundError): Config.load_config(config_path=invalid_path) @@ -186,20 +184,20 @@ def test_config_load_invalid_path(): def test_metadata_processor_fetch_data(mocker, fake_openalex_response): """Test the _fetch_data method of the MetadataProcessor class with mocked responses.""" doi = "10.1038/srep45389" - + # Mock API response - mocker.patch("doi2dataset.APIClient.make_request", + 
mocker.patch("doi2dataset.APIClient.make_request", return_value=FakeResponse(fake_openalex_response, 200)) - + # Create processor with upload disabled and progress disabled processor = MetadataProcessor(doi=doi, upload=False, progress=False) - + # Test the _fetch_data method directly data = processor._fetch_data() - + # Verify that data was fetched correctly assert data is not None assert data == fake_openalex_response - + # Verify the DOI is correctly stored assert processor.doi == doi diff --git a/tests/test_metadata_processor.py b/tests/test_metadata_processor.py index fcca30d..b8a3c62 100644 --- a/tests/test_metadata_processor.py +++ b/tests/test_metadata_processor.py @@ -1,7 +1,8 @@ import json import os +from unittest.mock import MagicMock + import pytest -from unittest.mock import MagicMock, patch from doi2dataset import MetadataProcessor @@ -27,36 +28,35 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa """Test that _build_metadata correctly extracts basic metadata fields""" # Mock the console to avoid print errors metadata_processor.console = MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - metadata_processor._build_organization_metadata = MagicMock(return_value={}) - + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) # Verify the basic metadata fields were extracted correctly assert metadata is not 
None assert 'datasetVersion' in metadata - + # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for basic metadata fields in a more flexible way field_names = [field.get('typeName') for field in fields] assert 'title' in field_names @@ -68,44 +68,43 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch): """Test that _build_metadata correctly processes author information""" # Mock the console to avoid print errors metadata_processor.console = MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - metadata_processor._build_organization_metadata = MagicMock(return_value={}) - + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for author and datasetContact fields field_names = [field.get('typeName') for field in fields] assert 'author' in field_names assert 'datasetContact' in field_names - + # 
Verify these are compound fields with actual entries for field in fields: if field.get('typeName') == 'author': assert 'value' in field assert isinstance(field['value'], list) assert len(field['value']) > 0 - + if field.get('typeName') == 'datasetContact': assert 'value' in field assert isinstance(field['value'], list) @@ -117,46 +116,45 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m """Test that _build_metadata correctly extracts keywords and topics""" # Mock the console to avoid print errors metadata_processor.console = MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - metadata_processor._build_organization_metadata = MagicMock(return_value={}) - + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) - + # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for keyword and subject fields field_names = [field.get('typeName') for field in fields] - + # If keywords exist, verify structure if 'keyword' in field_names: for field in fields: if field.get('typeName') == 'keyword': assert 'value' in field assert isinstance(field['value'], list) - + # Check for subject field which 
should definitely exist assert 'subject' in field_names for field in fields: if field.get('typeName') == 'subject': assert 'value' in field assert isinstance(field['value'], list) - assert len(field['value']) > 0 \ No newline at end of file + assert len(field['value']) > 0 diff --git a/tests/test_person.py b/tests/test_person.py index 3086088..2e1e030 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -1,5 +1,5 @@ -import pytest -from doi2dataset import Person, Institution +from doi2dataset import Institution, Person + def test_person_to_dict_with_string_affiliation(): """Test Person.to_dict() with a string affiliation.""" @@ -8,35 +8,32 @@ def test_person_to_dict_with_string_affiliation(): given_name="John", orcid="0000-0001-2345-6789", email="john.doe@example.org", - affiliation="Test University", - project=["Project A"] + affiliation="Test University" ) - + result = person.to_dict() - + assert result["family_name"] == "Doe" assert result["given_name"] == "John" assert result["orcid"] == "0000-0001-2345-6789" assert result["email"] == "john.doe@example.org" - assert result["project"] == ["Project A"] assert result["affiliation"] == "Test University" def test_person_to_dict_with_institution_ror(): """Test Person.to_dict() with an Institution that has a ROR ID.""" inst = Institution("Test University", "https://ror.org/12345") - + person = Person( family_name="Doe", given_name="John", orcid="0000-0001-2345-6789", email="john.doe@example.org", - affiliation=inst, - project=["Project A"] + affiliation=inst ) - + result = person.to_dict() - + assert result["affiliation"] == "https://ror.org/12345" # Check other fields too assert result["family_name"] == "Doe" @@ -46,16 +43,16 @@ def test_person_to_dict_with_institution_ror(): def test_person_to_dict_with_institution_display_name_only(): """Test Person.to_dict() with an Institution that has only a display_name.""" inst = Institution("Test University") # No ROR ID - + person = Person( family_name="Smith", 
given_name="Jane", orcid="0000-0001-9876-5432", affiliation=inst ) - + result = person.to_dict() - + assert result["affiliation"] == "Test University" assert result["family_name"] == "Smith" assert result["given_name"] == "Jane" @@ -65,15 +62,15 @@ def test_person_to_dict_with_empty_institution(): """Test Person.to_dict() with an Institution that has neither ROR nor display_name.""" # Create an Institution with empty values inst = Institution("") - + person = Person( family_name="Brown", given_name="Robert", affiliation=inst ) - + result = person.to_dict() - + assert result["affiliation"] == "" assert result["family_name"] == "Brown" assert result["given_name"] == "Robert" @@ -86,10 +83,10 @@ def test_person_to_dict_with_no_affiliation(): given_name="Alice", orcid="0000-0002-1111-2222" ) - + result = person.to_dict() - + assert result["affiliation"] == "" assert result["family_name"] == "Green" assert result["given_name"] == "Alice" - assert result["orcid"] == "0000-0002-1111-2222" \ No newline at end of file + assert result["orcid"] == "0000-0002-1111-2222"