doi2dataset/tests/test_metadata_processor.py
Alexander Minges eb270cba9b
All checks were successful
Test pipeline / test (push) Successful in 12s
Update testing documentation and improve test structure
2025-05-20 15:17:18 +02:00

162 lines
No EOL
6.6 KiB
Python

import json
import os
import pytest
from unittest.mock import MagicMock, patch
from doi2dataset import MetadataProcessor
@pytest.fixture
def openalex_data():
    """Return the parsed JSON stored in 'srep45389.json' beside this test module."""
    # Resolve the fixture file relative to this module, not the working directory.
    tests_dir = os.path.dirname(__file__)
    with open(os.path.join(tests_dir, "srep45389.json"), "r", encoding="utf-8") as handle:
        return json.load(handle)
@pytest.fixture
def metadata_processor():
    """Provide a MetadataProcessor for DOI 10.1038/srep45389 with upload and progress off."""
    # Individual tests replace the console and helper methods with mocks themselves.
    return MetadataProcessor(doi="10.1038/srep45389", upload=False, progress=False)
def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypatch):
    """Check that _build_metadata emits the title, subject and description fields."""
    # Swap the console for a mock so the processor cannot fail on printing.
    metadata_processor.console = MagicMock()

    # Stand-in abstract returned by the patched AbstractProcessor.get_abstract.
    fake_abstract = MagicMock()
    fake_abstract.text = "This is a sample abstract"
    fake_abstract.source = "openalex"
    monkeypatch.setattr(
        "doi2dataset.AbstractProcessor.get_abstract",
        lambda *args, **kwargs: fake_abstract,
    )

    # Serve the saved JSON instead of fetching anything.
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)

    # Stub out helpers that might cause issues when run in isolation.
    stubbed_results = (
        ("_build_description", "Test description"),
        ("_get_involved_pis", []),
        ("_build_organization_metadata", {}),
    )
    for method_name, canned_value in stubbed_results:
        setattr(metadata_processor, method_name, MagicMock(return_value=canned_value))

    # Exercise the method under test.
    metadata = metadata_processor._build_metadata(openalex_data)

    # The result must carry a datasetVersion with metadataBlocks.
    assert metadata is not None
    assert 'datasetVersion' in metadata
    dataset_version = metadata['datasetVersion']
    assert 'metadataBlocks' in dataset_version

    # A missing citation block falls back to {} and fails the 'fields' check.
    citation = dataset_version['metadataBlocks'].get('citation', {})
    assert 'fields' in citation

    # Collect the type names; the description is called 'dsDescription' in the schema.
    field_names = [entry.get('typeName') for entry in citation['fields']]
    for expected_name in ('title', 'subject', 'dsDescription'):
        assert expected_name in field_names
def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    """Check that _build_metadata produces author and datasetContact fields."""
    # Swap the console for a mock so the processor cannot fail on printing.
    metadata_processor.console = MagicMock()

    # Stand-in abstract returned by the patched AbstractProcessor.get_abstract.
    fake_abstract = MagicMock()
    fake_abstract.text = "This is a sample abstract"
    fake_abstract.source = "openalex"
    monkeypatch.setattr(
        "doi2dataset.AbstractProcessor.get_abstract",
        lambda *args, **kwargs: fake_abstract,
    )

    # Serve the saved JSON instead of fetching anything.
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)

    # Stub out helpers that might cause issues when run in isolation.
    stubbed_results = (
        ("_build_description", "Test description"),
        ("_get_involved_pis", []),
        ("_build_organization_metadata", {}),
    )
    for method_name, canned_value in stubbed_results:
        setattr(metadata_processor, method_name, MagicMock(return_value=canned_value))

    # Exercise the method under test.
    metadata = metadata_processor._build_metadata(openalex_data)

    # Drill down to the citation block's field list.
    dataset_version = metadata['datasetVersion']
    assert 'metadataBlocks' in dataset_version
    citation = dataset_version['metadataBlocks'].get('citation', {})
    assert 'fields' in citation
    fields = citation['fields']

    # Both field types must be present.
    field_names = [entry.get('typeName') for entry in fields]
    assert 'author' in field_names
    assert 'datasetContact' in field_names

    # Both are expected to hold a list under 'value'.
    for entry in fields:
        type_name = entry.get('typeName')
        if type_name not in ('author', 'datasetContact'):
            continue
        assert 'value' in entry
        assert isinstance(entry['value'], list)
        # Only authors must be non-empty; datasetContact might be empty
        # in the test environment, so its length is not asserted.
        if type_name == 'author':
            assert len(entry['value']) > 0
def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, monkeypatch):
    """Check the structure of keyword and subject fields built by _build_metadata."""
    # Swap the console for a mock so the processor cannot fail on printing.
    metadata_processor.console = MagicMock()

    # Stand-in abstract returned by the patched AbstractProcessor.get_abstract.
    fake_abstract = MagicMock()
    fake_abstract.text = "This is a sample abstract"
    fake_abstract.source = "openalex"
    monkeypatch.setattr(
        "doi2dataset.AbstractProcessor.get_abstract",
        lambda *args, **kwargs: fake_abstract,
    )

    # Serve the saved JSON instead of fetching anything.
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)

    # Stub out helpers that might cause issues when run in isolation.
    stubbed_results = (
        ("_build_description", "Test description"),
        ("_get_involved_pis", []),
        ("_build_organization_metadata", {}),
    )
    for method_name, canned_value in stubbed_results:
        setattr(metadata_processor, method_name, MagicMock(return_value=canned_value))

    # Exercise the method under test.
    metadata = metadata_processor._build_metadata(openalex_data)

    # Drill down to the citation block's field list.
    dataset_version = metadata['datasetVersion']
    assert 'metadataBlocks' in dataset_version
    citation = dataset_version['metadataBlocks'].get('citation', {})
    assert 'fields' in citation
    fields = citation['fields']
    field_names = [entry.get('typeName') for entry in fields]

    # Keywords are optional: when none exist this list is empty and nothing is checked.
    keyword_fields = [entry for entry in fields if entry.get('typeName') == 'keyword']
    for entry in keyword_fields:
        assert 'value' in entry
        assert isinstance(entry['value'], list)

    # The subject field is mandatory and must hold a non-empty list.
    assert 'subject' in field_names
    subject_fields = [entry for entry in fields if entry.get('typeName') == 'subject']
    for entry in subject_fields:
        assert 'value' in entry
        assert isinstance(entry['value'], list)
        assert len(entry['value']) > 0