"""Tests for ``MetadataProcessor._build_metadata`` using a saved OpenAlex response."""

import json
import os
from unittest.mock import MagicMock, patch

import pytest

from doi2dataset import MetadataProcessor


@pytest.fixture
def openalex_data():
    """Load the saved JSON response from the file 'srep45389.json'."""
    json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


@pytest.fixture
def metadata_processor():
    """Create a MetadataProcessor for the test DOI with upload and progress disabled.

    No dependencies are mocked here; each test installs its mocks through
    ``_build_citation_fields``.
    """
    doi = "10.1038/srep45389"
    return MetadataProcessor(doi=doi, upload=False, progress=False)


def _build_citation_fields(processor, data, monkeypatch):
    """Run ``processor._build_metadata(data)`` with mocks installed and return the citation fields.

    The same setup is needed by every test in this module, so it lives here
    instead of being copy-pasted into each test.

    Args:
        processor: The ``MetadataProcessor`` instance under test.
        data: Parsed OpenAlex JSON passed to ``_build_metadata``.
        monkeypatch: pytest's ``monkeypatch`` fixture, used to replace
            ``doi2dataset.AbstractProcessor.get_abstract``.

    Returns:
        The list stored under
        ``metadata['datasetVersion']['metadataBlocks']['citation']['fields']``.

    Raises:
        AssertionError: If the metadata is ``None`` or any level of that
            nesting is missing.
    """
    # Mock the console to avoid print errors
    processor.console = MagicMock()

    # Mock the abstract lookup so no real abstract processing takes place
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
    monkeypatch.setattr(
        "doi2dataset.AbstractProcessor.get_abstract",
        lambda *args, **kwargs: abstract_mock,
    )

    # Mock the _fetch_data method to return our test data
    processor._fetch_data = MagicMock(return_value=data)

    # Mock methods that might cause issues in isolation
    processor._build_description = MagicMock(return_value="Test description")
    processor._get_involved_pis = MagicMock(return_value=[])
    processor._build_organization_metadata = MagicMock(return_value={})

    # Call the method we're testing
    metadata = processor._build_metadata(data)

    # Check the structure level by level so a failure names the missing piece
    assert metadata is not None, "_build_metadata returned None"
    assert "datasetVersion" in metadata, "metadata has no 'datasetVersion'"
    dataset_version = metadata["datasetVersion"]
    assert "metadataBlocks" in dataset_version, "datasetVersion has no 'metadataBlocks'"
    citation = dataset_version["metadataBlocks"].get("citation", {})
    assert "fields" in citation, "citation block has no 'fields'"
    return citation["fields"]


def _fields_named(fields, type_name):
    """Return every entry of *fields* whose 'typeName' equals *type_name*."""
    return [field for field in fields if field.get("typeName") == type_name]


def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypatch):
    """Test that _build_metadata correctly extracts basic metadata fields."""
    fields = _build_citation_fields(metadata_processor, openalex_data, monkeypatch)

    field_names = [field.get("typeName") for field in fields]
    # Description is named 'dsDescription' in the schema
    for expected in ("title", "subject", "dsDescription"):
        assert expected in field_names, f"missing citation field {expected!r}"


def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    """Test that _build_metadata correctly processes author information."""
    fields = _build_citation_fields(metadata_processor, openalex_data, monkeypatch)

    author_fields = _fields_named(fields, "author")
    contact_fields = _fields_named(fields, "datasetContact")
    assert author_fields, "missing citation field 'author'"
    assert contact_fields, "missing citation field 'datasetContact'"

    # Verify these are compound fields with actual entries
    for field in author_fields:
        assert "value" in field
        assert isinstance(field["value"], list)
        assert len(field["value"]) > 0

    # The datasetContact might be empty in the test environment, so only
    # its presence and type are checked, not its length.
    for field in contact_fields:
        assert "value" in field
        assert isinstance(field["value"], list)


def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, monkeypatch):
    """Test that _build_metadata correctly extracts keywords and topics."""
    fields = _build_citation_fields(metadata_processor, openalex_data, monkeypatch)

    # Keywords are optional: verify their structure only when present
    for field in _fields_named(fields, "keyword"):
        assert "value" in field
        assert isinstance(field["value"], list)

    # The subject field should definitely exist and be non-empty
    subject_fields = _fields_named(fields, "subject")
    assert subject_fields, "missing citation field 'subject'"
    for field in subject_fields:
        assert "value" in field
        assert isinstance(field["value"], list)
        assert len(field["value"]) > 0