diff --git a/.coveragerc b/.coveragerc index 9b4d454..d898768 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,12 +1,8 @@ [run] source = doi2dataset -omit = +omit = */tests/* - */test_* */docs/* - */__pycache__/* - */venv/* - */.venv/* setup.py conf.py __init__.py @@ -15,22 +11,13 @@ omit = exclude_lines = pragma: no cover def __repr__ - def __str__ if self.debug: raise NotImplementedError - raise AssertionError if __name__ == .__main__.: - if TYPE_CHECKING: - @abstractmethod pass raise ImportError except ImportError - -show_missing = true -precision = 2 + def __str__ [html] -directory = htmlcov - -[xml] -output = coverage.xml +directory = htmlcov \ No newline at end of file diff --git a/.gitignore b/.gitignore index 85ed911..6ff6e4c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # Config file config.yaml -.config.yaml # Processed DOIs *.json @@ -59,7 +58,6 @@ htmlcov/ .cache nosetests.xml coverage.xml -junit.xml *.cover *.py,cover .hypothesis/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 376689d..0000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,36 +0,0 @@ -# GitLab CI/CD pipeline for doi2dataset -# Compatible with GitLab v18.1.1 - -stages: - - test - -variables: - PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" - -cache: - paths: - - .cache/pip/ - - .venv/ - -test: - stage: test - image: python:3 - before_script: - - python -m pip install --upgrade pip - - pip install -r requirements.txt - - pip install -r requirements-dev.txt - script: - - pytest - artifacts: - reports: - junit: junit.xml - coverage_report: - coverage_format: cobertura - path: coverage.xml - paths: - - htmlcov/ - expire_in: 1 week - coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' - only: - - branches - - merge_requests diff --git a/README.md b/README.md index 9c00601..8b66b2a 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,7 @@ - **DOI Validation and Normalization:** Validates DOIs and converts them into a standardized format. 
- **Metadata Retrieval:** Fetches metadata such as title, abstract, license, and author information from external sources. -- **Standard Dataverse Metadata:** Generates standard Dataverse citation metadata including: - - Title, publication date, and alternative URL - - Author information with affiliations and ORCID identifiers - - Dataset contact information (corresponding authors) - - Abstract and description - - Keywords and subject classification - - Grant/funding information - - License information when available +- **Metadata Mapping:** Automatically maps and generates metadata fields (e.g., title, description, keywords) including support for controlled vocabularies and compound fields. - **Optional Upload:** Allows uploading of metadata directly to a Dataverse.org server. - **Progress Tracking:** Uses the Rich library for user-friendly progress tracking and error handling. @@ -30,41 +23,14 @@ cd doi2dataset ## Configuration +Configuration + Before running the tool, configure the necessary settings in the `config.yaml` file located in the project root. 
This file contains configuration details such as: -- **Connection details**: URL, API token, authentication credentials for Dataverse server -- **Principal Investigator (PI) information**: Optional - used for fallback determination of corresponding authors when not explicitly specified in the publication -- **Default grant configurations**: Funding information to be included in the metadata (supports multiple grants) - -### Configuration File Structure - -The configuration file should follow this structure: - -```yaml -# Dataverse server connection details -dataverse: - url: "https://your-dataverse-instance.org" - api_token: "your-api-token" - -# Default grant information (supports multiple grants) -default_grants: - - funder: "Your Funding Agency" - id: "GRANT123456" - - funder: "Another Funding Agency" - id: "GRANT789012" - -# Principal investigators for fallback corresponding author determination (optional) -pis: - - family_name: "Doe" - given_name: "John" - orcid: "0000-0000-0000-0000" - email: "john.doe@university.edu" - affiliation: "Department of Science, University" -``` - -See `config_example.yaml` for a complete example configuration. - -**Note**: The PI section is optional. If no corresponding authors are found in the publication metadata and no PIs are configured, the tool will still generate metadata but may issue a warning about missing corresponding author information. +- Connection details (URL, API token, authentication credentials) +- Mapping of project phases +- Principal Investigator (PI) information +- Default grant configurations ## Usage @@ -103,6 +69,8 @@ Documentation is generated using Sphinx. See the `docs/` directory for detailed ## Testing +## Testing + Tests are implemented with pytest. The test suite provides comprehensive coverage of core functionalities. To run the tests, execute: ```bash @@ -134,13 +102,11 @@ pytest --cov=. --cov-report=html This creates a `htmlcov` directory. 
Open `htmlcov/index.html` in a browser to view the detailed coverage report. A `.coveragerc` configuration file is provided that: - - Excludes test files, documentation, and boilerplate code from coverage analysis - Configures reporting to ignore common non-testable lines (like defensive imports) - Sets the output directory for HTML reports Recent improvements have increased coverage from 48% to 61% by adding focused tests for: - - Citation building functionality - License processing and validation - Metadata field extraction @@ -148,7 +114,6 @@ Recent improvements have increased coverage from 48% to 61% by adding focused te - Publication data parsing and validation Areas that could benefit from additional testing: - - More edge cases in the MetadataProcessor class workflow - Additional CitationBuilder scenarios with diverse inputs - Complex network interactions and error handling @@ -157,7 +122,7 @@ Areas that could benefit from additional testing: The test suite is organized into six main files: -1. **test_doi2dataset.py**: Basic tests for core functions like name splitting, DOI validation, and filename sanitization. +1. **test_doi2dataset.py**: Basic tests for core functions like phase checking, name splitting and DOI validation. 2. **test_fetch_doi_mock.py**: Tests API interactions using a mock OpenAlex response stored in `srep45389.json`. 3. **test_citation_builder.py**: Tests for building citation metadata from API responses. 4. **test_metadata_processor.py**: Tests for the metadata processing workflow. @@ -171,6 +136,7 @@ The test suite covers the following categories of functionality: #### Core Functionality Tests - **DOI Validation and Processing**: Parameterized tests for DOI normalization, validation, and filename sanitization with various inputs. +- **Phase Management**: Tests for checking publication year against defined project phases, including boundary cases. 
- **Name Processing**: Extensive tests for parsing and splitting author names in different formats (with/without commas, middle initials, etc.). - **Email Validation**: Tests for proper validation of email addresses with various domain configurations. @@ -185,36 +151,12 @@ The test suite covers the following categories of functionality: - **Citation Building**: Tests for properly building citation metadata from API responses. - **License Processing**: Tests for correctly identifying and formatting license information from various license IDs. -- **Principal Investigator Matching**: Tests for finding project PIs based on ORCID identifiers (used for fallback corresponding author determination). +- **Principal Investigator Matching**: Tests for finding project PIs based on ORCID identifiers. - **Configuration Loading**: Tests for properly loading and validating configuration from files. - **Metadata Workflow**: Tests for the complete metadata processing workflow. These tests ensure that all components work correctly in isolation and together as a system, with special attention to edge cases and error handling. 
-## Changelog - -### Version 2.0 - Generalization Update - -This version has been updated to make the tool more generalized and suitable for broader use cases: - -**Breaking Changes:** - -- Removed organizational-specific metadata blocks (project phases, organizational fields) -- Removed `Phase` class and phase-related configuration -- Simplified configuration structure - -**What's New:** - -- Streamlined metadata generation focusing on standard Dataverse citation metadata -- Reduced configuration requirements for easier adoption -- Maintained PI information support for corresponding author fallback functionality - -**Migration Guide:** - -- Remove the `phase` section from your configuration file -- The tool will now generate only standard citation metadata blocks -- PI information is still supported and used for fallback corresponding author determination - ## Contributing Contributions are welcome! Please fork the repository and submit a pull request with your improvements. diff --git a/__init__.py b/__init__.py index f1bd259..0db05d7 100644 --- a/__init__.py +++ b/__init__.py @@ -8,9 +8,10 @@ from .doi2dataset import ( LicenseProcessor, MetadataProcessor, NameProcessor, - Person, PIFinder, + Person, + Phase, SubjectMapper, sanitize_filename, validate_email_address, -) +) \ No newline at end of file diff --git a/config_example.yaml b/config_example.yaml index d00d523..f14cdad 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -1,25 +1,23 @@ -dataverse: - url: "https://your-dataverse-instance.org" - api_token: "your-api-token-here" - dataverse: "your-dataverse-alias" - auth_user: "your-username" - auth_password: "your-password" - -default_grants: +default_grant: - funder: "Awesome Funding Agency" id: "ABC12345" - - funder: "Another Funding Agency" - id: "DEF67890" + +phase: + "Phase 1 (2021/2025)": + start: 2021 + end: 2025 pis: - family_name: "Doe" given_name: "Jon" orcid: "0000-0000-0000-0000" - email: "jon.doe@iana.org" + email: 
"jon.doe@some-university.edu" affiliation: "Institute of Science, Some University" + project: ["Project A01"] - family_name: "Doe" given_name: "Jane" orcid: "0000-0000-0000-0001" - email: "jane.doe@iana.org" + email: "jane.doe@some-university.edu" affiliation: "Institute of Science, Some University" + project: ["Project A02"] diff --git a/doi2dataset.py b/doi2dataset.py index 438e3a2..162a53a 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -109,6 +109,36 @@ class FieldType(Enum): COMPOUND = "compound" VOCABULARY = "controlledVocabulary" +@dataclass +class Phase: + """ + Represents a project phase with a defined time span. + + Attributes: + name (str): The name of the project phase. + start (int): The start year of the project phase. + end (int): The end year of the project phase. + """ + + name: str + start: int + end: int + + def check_year(self, year: int) -> bool: + """ + Checks whether a given year falls within the project's phase boundaries. + + Args: + year (int): The year to check. + + Returns: + bool: True if the year is within the phase boundaries, otherwise False. + """ + + if self.start <= year <= self.end: + return True + return False + @dataclass class BaseMetadataField[T]: """ @@ -271,7 +301,7 @@ class Institution: "termName": self.display_name, "@type": "https://schema.org/Organization" } - return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value=expanded_value) + return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value) else: return PrimitiveMetadataField("authorAffiliation", False, self.display_name) @@ -286,12 +316,14 @@ class Person: orcid (str): ORCID identifier (optional). email (str): Email address (optional). affiliation (Institution): Affiliation of the person (optional). + project (list[str]): List of associated projects. 
""" family_name: str given_name: str orcid: str = "" email: str = "" affiliation: Institution | str = "" + project: list[str] = field(default_factory=list) def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]: """ @@ -308,7 +340,8 @@ class Person: "family_name": self.family_name, "given_name": self.given_name, "orcid": self.orcid, - "email": self.email + "email": self.email, + "project": self.project } if isinstance(self.affiliation, Institution): @@ -431,10 +464,12 @@ class ConfigData: Attributes: dataverse (dict[str, str]): Dataverse-related configuration. + phase (dict[str, dict[str, int]]): Mapping of project phases. pis (list[dict[str, Any]]): List of principal investigator configurations. default_grants (list[dict[str, str]]): Default grant configurations. """ dataverse: dict[str, str] + phase: dict[str, dict[str, int]] pis: list[dict[str, Any]] default_grants: list[dict[str, str]] @@ -488,6 +523,7 @@ class Config: cls._config_data = ConfigData( dataverse=config_data.get('dataverse', {}), + phase=config_data.get('phase', {}), pis=config_data.get('pis', []), default_grants=config_data.get('default_grants', []) ) @@ -509,6 +545,16 @@ class Config: raise RuntimeError("Failed to load configuration") return cls._config_data + @property + def PHASE(self) -> dict[str, dict[str, int]]: + """ + Get phase configuration. + + Returns: + dict[str, dict[str, int]]: Mapping of phases. + """ + return self.get_config().phase + @property def PIS(self) -> list[dict[str, Any]]: """ @@ -787,10 +833,7 @@ class AbstractProcessor: else: console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning") else: - if license.name: - console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") - else: - console.print(f"\n{ICONS['info']} Custom license does not allow derivative works. 
Reconstructing abstract from OpenAlex!", style="info") + console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") openalex_abstract = self._get_openalex_abstract(data) @@ -1363,7 +1406,8 @@ class MetadataProcessor: CompoundMetadataField("grantNumber", True, grants).to_dict() ], "displayName": "Citation Metadata" - } + }, + "crc1430_org_v1": self._build_organization_metadata(data) }, "files": [] } @@ -1429,22 +1473,71 @@ class MetadataProcessor: """ return data.get("publication_year", "") + def _build_organization_metadata(self, data: dict[str, Any]) -> dict[str, Any]: + """ + Build organization metadata fields (phase, project, PI names). + Args: + data (dict[str, Any]): The metadata. + + Returns: + dict[str, Any]: Organization metadata. + """ + publication_year = self._get_publication_year(data) + if publication_year: + phases = self._get_phases(int(publication_year)) + else: + phases = [] + + pis = self._get_involved_pis(data) + projects: list[str] = [] + for pi in pis: + for project in pi.project: + projects.append(project) + + pi_names: list[str] = [] + for pi in pis: + pi_names.append(pi.format_name()) + + # Deduplicate projects and PI names + unique_projects = list(set(projects)) + unique_pi_names = list(set(pi_names)) + + return { + "fields": [ + ControlledVocabularyMetadataField("crc1430OrgV1Phase", True, phases).to_dict(), + ControlledVocabularyMetadataField("crc1430OrgV1Project", True, unique_projects).to_dict(), + ControlledVocabularyMetadataField("crc1430OrgV1PI", True, unique_pi_names).to_dict() + ] + } + + def _get_phases(self, year: int) -> list[str]: + """ + Determine the project phases matching a given publication year. + + Args: + year (int): The publication year. + + Returns: + list[str]: List of matching phase names. 
+ """ + config = Config() + matching_phases: list[str] = [] + for phase_name, phase_info in config.PHASE.items(): + phase = Phase(phase_name, phase_info["start"], phase_info["end"]) + if phase.check_year(year): + matching_phases.append(phase.name) + return matching_phases def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]: """ - Identify involved principal investigators from the metadata for use as fallback - corresponding authors. - - This method matches authors in the publication metadata against the configured - PIs and returns matching PIs. It is used as a fallback when no corresponding - authors are explicitly declared in the publication metadata. + Identify involved principal investigators from the metadata. Args: - data (dict[str, Any]): The metadata from OpenAlex. + data (dict[str, Any]): The metadata. Returns: - list[Person]: List of matching PIs for use as corresponding authors. + list[Person]: List of PIs. """ involved_pis: list[Person] = [] for authorship in data.get("authorships", []): diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index c32678f..0000000 --- a/pytest.ini +++ /dev/null @@ -1,15 +0,0 @@ -[pytest] -addopts = - --cov=doi2dataset - --cov-report=html - --cov-report=xml - --cov-report=term-missing - --junitxml=junit.xml - --verbose - --tb=short - -testpaths = tests - -python_files = test_*.py *_test.py -python_functions = test_* -python_classes = Test* diff --git a/requirements-dev.txt b/requirements-dev.txt index ab30c10..dfadb8c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,3 @@ pytest>=8.3.5,<9.0 pytest-mock>=3.14.0,<4.0 -pytest-cov>=6.0.0,<7.0 ruff>=0.11.1,<0.20 diff --git a/setup.py b/setup.py index f88994f..be93bdd 100644 --- a/setup.py +++ b/setup.py @@ -25,8 +25,7 @@ setup( ], "dev": [ "pytest>=8.3.5,<9.0", - "pytest-mock>=3.14.0,<4.0", - "pytest-cov>=6.0.0,<7.0", + "pytest-mock>=3.14.0,<4.0", "ruff>=0.11.1,<0.20" ] }, diff --git a/tests/config_test.yaml b/tests/config_test.yaml 
index 9130659..e17f88d 100644 --- a/tests/config_test.yaml +++ b/tests/config_test.yaml @@ -1,16 +1,23 @@ -default_grants: +default_grant: - funder: "Awesome Funding Agency" id: "ABC12345" +phase: + "Phase 1 (2021/2025)": + start: 2021 + end: 2025 + pis: - family_name: "Doe" given_name: "Jon" orcid: "0000-0000-0000-0000" email: "jon.doe@iana.org" affiliation: "Institute of Science, Some University" + project: ["Project A01"] - family_name: "Doe" given_name: "Jane" orcid: "0000-0000-0000-0001" email: "jane.doe@iana.org" affiliation: "Institute of Science, Some University" + project: ["Project A02"] diff --git a/tests/test_citation_builder.py b/tests/test_citation_builder.py index 055e93e..f66aa1a 100644 --- a/tests/test_citation_builder.py +++ b/tests/test_citation_builder.py @@ -1,9 +1,13 @@ import json import os - import pytest +from unittest.mock import MagicMock -from doi2dataset import CitationBuilder, Person, PIFinder +from doi2dataset import ( + CitationBuilder, + PIFinder, + Person +) @pytest.fixture @@ -23,7 +27,8 @@ def test_pi(): given_name="Author", orcid="0000-0000-0000-1234", email="test.author@example.org", - affiliation="Test University" + affiliation="Test University", + project=["Test Project"] ) @@ -38,15 +43,15 @@ def test_build_authors(openalex_data, pi_finder): """Test that CitationBuilder.build_authors correctly processes author information""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Call the build_authors method - returns tuple of (authors, corresponding_authors) authors, corresponding_authors = builder.build_authors() - + # Verify that authors were created assert authors is not None assert isinstance(authors, list) assert len(authors) > 0 - + # Check the structure of the authors for author in authors: assert hasattr(author, "given_name") @@ -59,17 +64,17 @@ def test_build_authors_with_affiliations(openalex_data, pi_finder): """Test that author affiliations are correctly 
processed""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Call the build_authors method authors, _ = builder.build_authors() - + # Check if any authors have affiliation affiliation_found = False for author in authors: if hasattr(author, "affiliation") and author.affiliation: affiliation_found = True break - + # We may not have affiliations in the test data, so only assert if we found any if affiliation_found: assert affiliation_found, "No author with affiliation found" @@ -79,14 +84,14 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder): """Test that corresponding authors are correctly identified""" doi = "10.1038/srep45389" builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder) - + # Process authors authors, corresponding_authors = builder.build_authors() - + # Verify that corresponding authors were identified if len(corresponding_authors) > 0: assert len(corresponding_authors) > 0, "No corresponding authors identified" - + # Check structure of corresponding authors for author in corresponding_authors: assert hasattr(author, "given_name") @@ -98,7 +103,7 @@ def test_build_authors_with_corresponding_author(openalex_data, pi_finder): def test_build_authors_with_ror(openalex_data, pi_finder): """Test that ROR (Research Organization Registry) identifiers are correctly used when ror=True""" doi = "10.1038/srep45389" - + # First confirm the sample data contains at least one institution with a ROR identifier has_ror_institution = False for authorship in openalex_data.get("authorships", []): @@ -109,61 +114,61 @@ def test_build_authors_with_ror(openalex_data, pi_finder): break if has_ror_institution: break - + # Skip test if no ROR identifiers in sample data if not has_ror_institution: pytest.skip("Test data doesn't contain any ROR identifiers") - + # Create builder with ror=True to enable ROR identifiers builder = CitationBuilder(data=openalex_data, doi=doi, 
pi_finder=pi_finder, ror=True) - + # Get authors authors, _ = builder.build_authors() - + # Verify we got authors back assert len(authors) > 0, "No authors were extracted from the test data" - + # Check for at least one Institution with a ROR ID ror_found = False institution_with_ror = None - + for author in authors: # Check if author has affiliation if not hasattr(author, 'affiliation') or not author.affiliation: continue - + # Check if affiliation is an Institution with a ROR ID if not hasattr(author.affiliation, 'ror'): continue - + # Check if ROR ID is present and contains "ror.org" if author.affiliation.ror and "ror.org" in author.affiliation.ror: ror_found = True institution_with_ror = author.affiliation break - + # Verify ROR IDs are used when ror=True assert ror_found, "Expected at least one author with a ROR ID when ror=True" - + # Check expanded_value in the affiliation field when ROR is used if institution_with_ror: # Get the affiliation field affiliation_field = institution_with_ror.affiliation_field() - + # Verify it's set up correctly with the ROR ID as the value assert affiliation_field.value == institution_with_ror.ror - + # Verify the expanded_value dictionary has the expected structure assert hasattr(affiliation_field, 'expanded_value') assert isinstance(affiliation_field.expanded_value, dict) - + # Check specific fields in the expanded_value expanded_value = affiliation_field.expanded_value assert "scheme" in expanded_value assert expanded_value["scheme"] == "http://www.grid.ac/ontology/" - + assert "termName" in expanded_value assert expanded_value["termName"] == institution_with_ror.display_name - + assert "@type" in expanded_value - assert expanded_value["@type"] == "https://schema.org/Organization" + assert expanded_value["@type"] == "https://schema.org/Organization" \ No newline at end of file diff --git a/tests/test_doi2dataset.py b/tests/test_doi2dataset.py index e5515d8..65ceecb 100644 --- a/tests/test_doi2dataset.py +++ 
b/tests/test_doi2dataset.py @@ -3,9 +3,21 @@ import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from doi2dataset import NameProcessor, sanitize_filename, validate_email_address +from doi2dataset import NameProcessor, Phase, sanitize_filename, validate_email_address +def test_phase_check_year(): + """Test that check_year correctly determines if a year is within the phase boundaries.""" + phase = Phase("TestPhase", 2000, 2010) + # Within boundaries + assert phase.check_year(2005) is True + # Outside boundaries + assert phase.check_year(1999) is False + assert phase.check_year(2011) is False + # Boundary cases + assert phase.check_year(2000) is True + assert phase.check_year(2010) is True + def test_sanitize_filename(): """Test the sanitize_filename function to convert DOI to a valid filename.""" doi = "10.1234/abc.def" diff --git a/tests/test_fetch_doi_mock.py b/tests/test_fetch_doi_mock.py index 6e2745c..e9f1f44 100644 --- a/tests/test_fetch_doi_mock.py +++ b/tests/test_fetch_doi_mock.py @@ -4,15 +4,16 @@ import os import pytest from doi2dataset import ( - AbstractProcessor, + AbstractProcessor, APIClient, - CitationBuilder, - Config, - LicenseProcessor, + CitationBuilder, + Config, + License, + LicenseProcessor, MetadataProcessor, Person, PIFinder, - SubjectMapper, + SubjectMapper ) @@ -77,16 +78,16 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response): """Test the extraction of abstracts from OpenAlex inverted index data.""" # Create API client for AbstractProcessor api_client = APIClient() - + # Create processor processor = AbstractProcessor(api_client=api_client) - + # Call the protected method directly with the fake response abstract_text = processor._get_openalex_abstract(fake_openalex_response) - + # Verify abstract was extracted assert abstract_text is not None - + # If abstract exists in the response, it should be properly extracted if 'abstract_inverted_index' in fake_openalex_response: assert 
len(abstract_text) > 0 @@ -96,15 +97,15 @@ def test_subject_mapper(fake_openalex_response): """Test that the SubjectMapper correctly maps OpenAlex topics to subjects.""" # Extract topics from the OpenAlex response topics = fake_openalex_response.get("topics", []) - + # Convert topics to strings - we'll use display_name topic_names = [] if topics: topic_names = [topic.get("display_name") for topic in topics if topic.get("display_name")] - + # Get subjects using the class method subjects = SubjectMapper.get_subjects({"topics": topics}) - + # Verify subjects were returned assert subjects is not None assert isinstance(subjects, list) @@ -113,21 +114,21 @@ def test_subject_mapper(fake_openalex_response): def test_citation_builder(fake_openalex_response): """Test that the CitationBuilder correctly builds author information.""" doi = "10.1038/srep45389" - + # Mock PIFinder with an empty list of PIs pi_finder = PIFinder(pis=[]) - + # Create builder with required arguments builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder) - + # Test building other IDs other_ids = builder.build_other_ids() assert isinstance(other_ids, list) - + # Test building grants grants = builder.build_grants() assert isinstance(grants, list) - + # Test building topics topics = builder.build_topics() assert isinstance(topics, list) @@ -139,10 +140,10 @@ def test_license_processor(fake_openalex_response): license_data = { "primary_location": fake_openalex_response.get("primary_location", {}) } - + # Process the license license_obj = LicenseProcessor.process_license(license_data) - + # Verify license processing assert license_obj is not None assert hasattr(license_obj, "name") @@ -157,15 +158,16 @@ def test_pi_finder_find_by_orcid(): given_name="Jon", orcid="0000-0000-0000-0000", email="jon.doe@iana.org", - affiliation="Institute of Science, Some University" + affiliation="Institute of Science, Some University", + project=["Project A01"] ) - + # Create PIFinder with our 
test PI finder = PIFinder(pis=[test_pi]) - + # Find PI by ORCID pi = finder._find_by_orcid("0000-0000-0000-0000") - + # Verify the PI was found assert pi is not None assert pi.family_name == "Doe" @@ -175,7 +177,7 @@ def test_pi_finder_find_by_orcid(): def test_config_load_invalid_path(): """Test that Config.load_config raises an error when an invalid path is provided.""" invalid_path = "non_existent_config.yaml" - + # Verify that attempting to load a non-existent config raises an error with pytest.raises(FileNotFoundError): Config.load_config(config_path=invalid_path) @@ -184,20 +186,20 @@ def test_config_load_invalid_path(): def test_metadata_processor_fetch_data(mocker, fake_openalex_response): """Test the _fetch_data method of the MetadataProcessor class with mocked responses.""" doi = "10.1038/srep45389" - + # Mock API response - mocker.patch("doi2dataset.APIClient.make_request", + mocker.patch("doi2dataset.APIClient.make_request", return_value=FakeResponse(fake_openalex_response, 200)) - + # Create processor with upload disabled and progress disabled processor = MetadataProcessor(doi=doi, upload=False, progress=False) - + # Test the _fetch_data method directly data = processor._fetch_data() - + # Verify that data was fetched correctly assert data is not None assert data == fake_openalex_response - + # Verify the DOI is correctly stored assert processor.doi == doi diff --git a/tests/test_metadata_processor.py b/tests/test_metadata_processor.py index b8a3c62..fcca30d 100644 --- a/tests/test_metadata_processor.py +++ b/tests/test_metadata_processor.py @@ -1,8 +1,7 @@ import json import os -from unittest.mock import MagicMock - import pytest +from unittest.mock import MagicMock, patch from doi2dataset import MetadataProcessor @@ -28,35 +27,36 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa """Test that _build_metadata correctly extracts basic metadata fields""" # Mock the console to avoid print errors metadata_processor.console 
= MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - + metadata_processor._build_organization_metadata = MagicMock(return_value={}) + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) # Verify the basic metadata fields were extracted correctly assert metadata is not None assert 'datasetVersion' in metadata - + # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for basic metadata fields in a more flexible way field_names = [field.get('typeName') for field in fields] assert 'title' in field_names @@ -68,43 +68,44 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch): """Test that _build_metadata correctly processes author information""" # Mock the console to avoid print errors metadata_processor.console = MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data 
metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - + metadata_processor._build_organization_metadata = MagicMock(return_value={}) + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for author and datasetContact fields field_names = [field.get('typeName') for field in fields] assert 'author' in field_names assert 'datasetContact' in field_names - + # Verify these are compound fields with actual entries for field in fields: if field.get('typeName') == 'author': assert 'value' in field assert isinstance(field['value'], list) assert len(field['value']) > 0 - + if field.get('typeName') == 'datasetContact': assert 'value' in field assert isinstance(field['value'], list) @@ -116,45 +117,46 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m """Test that _build_metadata correctly extracts keywords and topics""" # Mock the console to avoid print errors metadata_processor.console = MagicMock() - + # Mock the Abstract related methods and objects to avoid console errors abstract_mock = MagicMock() abstract_mock.text = "This is a sample abstract" abstract_mock.source = "openalex" monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock) - + # Mock the _fetch_data method to return our test data metadata_processor._fetch_data = MagicMock(return_value=openalex_data) - + # Mock methods that might cause issues in isolation 
metadata_processor._build_description = MagicMock(return_value="Test description") metadata_processor._get_involved_pis = MagicMock(return_value=[]) - + metadata_processor._build_organization_metadata = MagicMock(return_value={}) + # Call the method we're testing metadata = metadata_processor._build_metadata(openalex_data) - + # Examine the fields inside datasetVersion.metadataBlocks assert 'metadataBlocks' in metadata['datasetVersion'] citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {}) - + # Check fields in citation section assert 'fields' in citation fields = citation['fields'] - + # Check for keyword and subject fields field_names = [field.get('typeName') for field in fields] - + # If keywords exist, verify structure if 'keyword' in field_names: for field in fields: if field.get('typeName') == 'keyword': assert 'value' in field assert isinstance(field['value'], list) - + # Check for subject field which should definitely exist assert 'subject' in field_names for field in fields: if field.get('typeName') == 'subject': assert 'value' in field assert isinstance(field['value'], list) - assert len(field['value']) > 0 + assert len(field['value']) > 0 \ No newline at end of file diff --git a/tests/test_person.py b/tests/test_person.py index 2e1e030..3086088 100644 --- a/tests/test_person.py +++ b/tests/test_person.py @@ -1,5 +1,5 @@ -from doi2dataset import Institution, Person - +import pytest +from doi2dataset import Person, Institution def test_person_to_dict_with_string_affiliation(): """Test Person.to_dict() with a string affiliation.""" @@ -8,32 +8,35 @@ def test_person_to_dict_with_string_affiliation(): given_name="John", orcid="0000-0001-2345-6789", email="john.doe@example.org", - affiliation="Test University" + affiliation="Test University", + project=["Project A"] ) - + result = person.to_dict() - + assert result["family_name"] == "Doe" assert result["given_name"] == "John" assert result["orcid"] == "0000-0001-2345-6789" assert 
result["email"] == "john.doe@example.org" + assert result["project"] == ["Project A"] assert result["affiliation"] == "Test University" def test_person_to_dict_with_institution_ror(): """Test Person.to_dict() with an Institution that has a ROR ID.""" inst = Institution("Test University", "https://ror.org/12345") - + person = Person( family_name="Doe", given_name="John", orcid="0000-0001-2345-6789", email="john.doe@example.org", - affiliation=inst + affiliation=inst, + project=["Project A"] ) - + result = person.to_dict() - + assert result["affiliation"] == "https://ror.org/12345" # Check other fields too assert result["family_name"] == "Doe" @@ -43,16 +46,16 @@ def test_person_to_dict_with_institution_ror(): def test_person_to_dict_with_institution_display_name_only(): """Test Person.to_dict() with an Institution that has only a display_name.""" inst = Institution("Test University") # No ROR ID - + person = Person( family_name="Smith", given_name="Jane", orcid="0000-0001-9876-5432", affiliation=inst ) - + result = person.to_dict() - + assert result["affiliation"] == "Test University" assert result["family_name"] == "Smith" assert result["given_name"] == "Jane" @@ -62,15 +65,15 @@ def test_person_to_dict_with_empty_institution(): """Test Person.to_dict() with an Institution that has neither ROR nor display_name.""" # Create an Institution with empty values inst = Institution("") - + person = Person( family_name="Brown", given_name="Robert", affiliation=inst ) - + result = person.to_dict() - + assert result["affiliation"] == "" assert result["family_name"] == "Brown" assert result["given_name"] == "Robert" @@ -83,10 +86,10 @@ def test_person_to_dict_with_no_affiliation(): given_name="Alice", orcid="0000-0002-1111-2222" ) - + result = person.to_dict() - + assert result["affiliation"] == "" assert result["family_name"] == "Green" assert result["given_name"] == "Alice" - assert result["orcid"] == "0000-0002-1111-2222" + assert result["orcid"] == "0000-0002-1111-2222" \ 
No newline at end of file