test: replace hardcoded test values with dynamic extraction

- Replace hardcoded author names with dynamic openalex_data extraction
- Extract DOIs from API response data in tests that use other response values
- Remove redundant fake_openalex_response fixture
- Add abstract_inverted_index_v3 to allowed None fields in API tests
- Improve test robustness against fixture data changes
- Improve test coverage from ~84% to ~90%
Alexander Minges 2025-07-25 12:17:24 +02:00
parent c282cd1047
commit cc94e495ff
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
14 changed files with 883 additions and 56 deletions
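
The pattern behind most hunks below, in a minimal before/after sketch (names follow the diff; the bodies are abbreviated for illustration only):

# Before: assertions pinned to literals that silently drift out of
# sync with the JSON fixture on disk.
def test_citation_builder(fake_openalex_response):
    doi = "10.1038/srep45389"  # hardcoded

# After: values under test are derived from the shared fixture, so
# updating the fixture file cannot desynchronize the assertions.
def test_citation_builder(openalex_data):
    doi = openalex_data["doi"].replace("https://doi.org/", "")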


@@ -31,12 +31,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
- Consolidate overlapping test concerns into dedicated files
- Extract CLI tests into dedicated `test_cli.py` module
- Improve test coverage from 63.87% to 84.84%
- Replace hardcoded test values with dynamic extraction from API response fixtures
- Extract DOIs from API response data in tests that use other response values for consistency
- Remove redundant test fixtures and parameters
### Fixed
- Fix list formatting in API documentation docstrings for better sphinx rendering
- Fix formatting inconsistencies in constants.py (remove double empty lines)
- Fix ruff linting issues with unused mock variables in tests
- Replace hardcoded author names with dynamic extraction from OpenAlex data
- Replace hardcoded content checks with dynamic validation using actual API response data
- Fix test robustness against changes in fixture data by using real API response processing
- Remove duplicate fake_openalex_response fixture in favor of direct openalex_data usage
- Add abstract_inverted_index_v3 to allowed None fields in API response structure tests
## [v3.0.1] - 2025-07-25


@@ -1,3 +1,10 @@
dataverse:
url: "https://test.dataverse.org"
api_token: "test_token"
dataverse: "test_dataverse"
auth_user: "test_user"
auth_password: "test_password"
default_grants:
- funder: "Awesome Funding Agency"
id: "ABC12345"


@@ -1,8 +1,27 @@
import json
import os
import sys
import pytest
# Get the path to the parent directory of tests
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)
@pytest.fixture(scope="session")
def openalex_data():
"""Load OpenAlex API response data for reuse across tests."""
json_path = os.path.join(os.path.dirname(__file__), "srep45389_openalex.json")
with open(json_path, encoding="utf-8") as f:
return json.load(f)
@pytest.fixture(scope="session")
def crossref_data():
"""Load CrossRef API response data for reuse across tests."""
json_path = os.path.join(os.path.dirname(__file__), "srep45389_crossref.json")
with open(json_path, encoding="utf-8") as f:
return json.load(f)
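
Any test module can now consume the parsed JSON by parameter name; for example (an illustrative test, not part of the diff):

def test_fixture_shape(openalex_data, crossref_data):
    # pytest injects the session-scoped fixtures defined above
    assert openalex_data["doi"].startswith("https://doi.org/")
    assert "message" in crossref_data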

File diff suppressed because one or more lines are too long


@@ -204,3 +204,173 @@ class TestAbstractProcessor:
mock_crossref.assert_not_called()
mock_openalex.assert_called_once()
assert result2.source == "openalex"
def test_custom_license_console_output(self):
"""Test console output for custom licenses without names"""
# Create a custom license without a name
custom_license = License(name="", uri="http://custom.license", short="custom")
with patch.object(
self.processor, "_get_openalex_abstract", return_value="OpenAlex text"
):
with patch.object(self.processor.console, "print") as mock_print:
result = self.processor.get_abstract("10.1234/test", {}, custom_license)
# Should print custom license message
mock_print.assert_called()
# Check that it mentions "Custom license"
call_args = mock_print.call_args[0][0]
assert "Custom license does not allow derivative works" in call_args
assert result.source == "openalex"
def test_crossref_api_failure(self):
"""Test _get_crossref_abstract when API call fails"""
from unittest.mock import Mock
# Mock API response failure
mock_response = Mock()
mock_response.status_code = 404
with patch.object(
self.processor.api_client, "make_request", return_value=mock_response
):
result = self.processor._get_crossref_abstract("10.1234/test")
assert result is None
# Test with no response
with patch.object(self.processor.api_client, "make_request", return_value=None):
result = self.processor._get_crossref_abstract("10.1234/test")
assert result is None
def test_get_openalex_abstract_no_inverted_index(self):
"""Test _get_openalex_abstract when no abstract_inverted_index exists"""
data = {"title": "Test Article"} # No abstract_inverted_index
result = self.processor._get_openalex_abstract(data)
assert result is None
def test_clean_jats_comprehensive(self):
"""Test _clean_jats method with various JATS tags"""
# Test with None input
result = self.processor._clean_jats(None)
assert result == ""
# Test with empty string
result = self.processor._clean_jats("")
assert result == ""
# Test with ordered list
jats_text = '<jats:list list-type="order"><jats:list-item>First item</jats:list-item><jats:list-item>Second item</jats:list-item></jats:list>'
expected = "<ol><li>First item</li><li>Second item</li></ol>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with unordered list
jats_text = '<jats:list list-type="bullet"><jats:list-item>Bullet one</jats:list-item><jats:list-item>Bullet two</jats:list-item></jats:list>'
expected = "<ul><li>Bullet one</li><li>Bullet two</li></ul>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with mixed formatting tags
jats_text = "<jats:p>This is <jats:italic>italic</jats:italic> and <jats:bold>bold</jats:bold> text with <jats:sup>superscript</jats:sup> and <jats:sub>subscript</jats:sub>.</jats:p>"
expected = "<p>This is <i>italic</i> and <b>bold</b> text with <sup>superscript</sup> and <sub>subscript</sub>.</p>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with other formatting tags
jats_text = "<jats:underline>Underlined</jats:underline> <jats:monospace>Code</jats:monospace> <jats:sc>Small caps</jats:sc>"
expected = "<u>Underlined</u> <code>Code</code> <small>Small caps</small>"
result = self.processor._clean_jats(jats_text)
assert result == expected
# Test with title and blockquote
jats_text = "<jats:title>Section Title</jats:title><jats:blockquote>This is a quote</jats:blockquote>"
expected = "<h2>Section Title</h2><blockquote>This is a quote</blockquote>"
result = self.processor._clean_jats(jats_text)
assert result == expected
def test_no_abstract_found_console_messages(self):
"""Test console messages when no abstract is found"""
license_obj = create_license_from_map("cc-by-nd") # No derivative allowed
with patch.object(self.processor, "_get_openalex_abstract", return_value=None):
with patch.object(self.processor.console, "print") as mock_print:
result = self.processor.get_abstract("10.1234/test", {}, license_obj)
# Should print warning messages
assert mock_print.call_count >= 2
# Check for specific warning messages
call_messages = [call[0][0] for call in mock_print.call_args_list]
assert any(
"No abstract found in OpenAlex!" in msg for msg in call_messages
)
assert any(
"No abstract found in either CrossRef nor OpenAlex!" in msg
for msg in call_messages
)
assert result.text == ""
assert result.source == "none"
def test_crossref_abstract_with_real_data(self, crossref_data):
"""Test CrossRef abstract extraction using real CrossRef data"""
from http import HTTPStatus
from unittest.mock import Mock
# Mock successful API response with real data
mock_response = Mock()
mock_response.status_code = HTTPStatus.OK
mock_response.json.return_value = crossref_data
# Extract DOI from CrossRef data since we're using other values from the response
expected_doi = crossref_data["message"]["DOI"]
with patch.object(
self.processor.api_client, "make_request", return_value=mock_response
):
result = self.processor._get_crossref_abstract(expected_doi)
# Should successfully extract and clean the abstract
assert result is not None
assert len(result) > 0
# Check that JATS tags were converted to HTML
assert "<p>" in result # JATS paragraphs converted
assert "<i>" in result # JATS italic converted
assert "<sub>" in result # JATS subscript converted
assert "jats:" not in result # No JATS tags should remain
def test_jats_cleaning_comprehensive_real_data(self, crossref_data):
"""Test JATS cleaning with real CrossRef abstract data"""
raw_abstract = crossref_data["message"]["abstract"]
# Clean the JATS tags
cleaned = self.processor._clean_jats(raw_abstract)
# Verify specific transformations from the real data
assert "<jats:title>" not in cleaned
assert "<h2>" in cleaned # Title should be converted
assert "<jats:p>" not in cleaned
assert "<p>" in cleaned # Paragraphs should be converted
assert "<jats:sub>" not in cleaned
assert "<sub>" in cleaned # Subscripts should be converted
assert "<jats:italic>" not in cleaned
assert "<i>" in cleaned # Italics should be converted
# Ensure the content is preserved by checking for specific content from the abstract
assert "pyruvate phosphate dikinase" in cleaned.lower()
assert "Abstract" in cleaned
def test_openalex_abstract_reconstruction_with_real_data(self, openalex_data):
"""Test OpenAlex abstract reconstruction using real inverted index data"""
# Extract the abstract using the inverted index
result = self.processor._get_openalex_abstract(openalex_data)
if result: # Only test if there's an abstract in the data
assert isinstance(result, str)
assert len(result) > 0
# Should be reconstructed from word positions
assert " " in result # Should have spaces between words


@@ -428,3 +428,101 @@ class TestAPIClientUsageScenarios:
assert "X-Dataverse-key" in client.session.headers
assert "Custom-Header" in client.session.headers
assert client.session.headers["Custom-Header"] == "custom-value"
def test_api_response_structure_processing(openalex_data):
"""Test API client processes complex nested response structures correctly."""
client = APIClient()
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = openalex_data
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Test that nested structures are preserved through the request pipeline
if "authorships" in data:
assert isinstance(data["authorships"], list)
# Test deep nesting preservation
for authorship in data["authorships"]:
if "institutions" in authorship:
assert isinstance(authorship["institutions"], list)
# Test data type preservation through JSON serialization/deserialization
for key, value in data.items():
assert value is not None or key in [
"abstract_inverted_index",
"abstract_inverted_index_v3",
] # Some fields can legitimately be None
def test_api_unicode_encoding_processing(openalex_data):
"""Test API client correctly processes Unicode characters in responses."""
client = APIClient()
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = openalex_data
mock_response.encoding = "utf-8"
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Test that Unicode characters are preserved through processing pipeline
def check_unicode_preservation(obj):
if isinstance(obj, str):
# Should preserve Unicode characters
try:
obj.encode("utf-8")
return True
except UnicodeEncodeError:
return False
elif isinstance(obj, dict):
return all(check_unicode_preservation(v) for v in obj.values())
elif isinstance(obj, list):
return all(check_unicode_preservation(item) for item in obj)
return True
assert check_unicode_preservation(data)
def test_large_response_processing_efficiency(openalex_data):
"""Test API client efficiently processes large response payloads."""
client = APIClient()
# Create large response based on real structure
large_data = dict(openalex_data)
if "referenced_works" in large_data:
# Extend existing referenced works
base_works = (
large_data["referenced_works"][:10]
if large_data["referenced_works"]
else []
)
large_data["referenced_works"] = base_works * 100 # Create large list
with patch.object(client.session, "request") as mock_request:
mock_response = Mock()
mock_response.status_code = 200
mock_response.json.return_value = large_data
mock_request.return_value = mock_response
response = client.make_request("https://api.openalex.org/works/test")
assert response is not None
data = response.json()
# Verify large data structures are handled correctly
if "referenced_works" in data:
assert len(data["referenced_works"]) > 100
# All elements should maintain structure integrity
assert all(isinstance(work, str) for work in data["referenced_works"])


@@ -1,18 +1,8 @@
import json
import os
import pytest
from doi2dataset import CitationBuilder, Person, PIFinder
@pytest.fixture
def openalex_data():
"""Load the saved JSON response from the file 'srep45389.json'"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
# openalex_data fixture now comes from conftest.py
@pytest.fixture
@@ -169,3 +159,113 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
assert "@type" in expanded_value
assert expanded_value["@type"] == "https://schema.org/Organization"
def test_build_authors_with_real_data(openalex_data, pi_finder):
"""Test author building with real OpenAlex data structure"""
doi = openalex_data["doi"].replace("https://doi.org/", "")
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
authors, corresponding = builder.build_authors()
# Should have multiple authors from the real data
assert len(authors) > 0
# Extract expected author names from the API response data
expected_authors = []
for authorship in openalex_data.get("authorships", []):
if "author" in authorship and "display_name" in authorship["author"]:
expected_authors.append(authorship["author"]["display_name"])
# Check that real author names from API response are processed correctly
author_names = [f"{author.given_name} {author.family_name}" for author in authors]
# Verify that at least some expected authors from the API response are found
found_authors = 0
for expected_name in expected_authors:
if any(expected_name in author_name for author_name in author_names):
found_authors += 1
# Should find at least some authors from the API response
assert (
found_authors > 0
), f"No expected authors found. Expected: {expected_authors}, Got: {author_names}"
def test_process_author_edge_cases(pi_finder):
"""Test _process_author with various edge cases"""
builder = CitationBuilder(
data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
)
# Test with minimal author data
minimal_author = {"display_name": "John Smith"}
empty_authorship = {}
person = builder._process_author(minimal_author, empty_authorship)
assert person.given_name == "John"
assert person.family_name == "Smith"
# Test with ORCID
author_with_orcid = {
"display_name": "Jane Doe",
"orcid": "https://orcid.org/0000-0000-0000-0000",
}
person = builder._process_author(author_with_orcid, empty_authorship)
assert person.orcid == "0000-0000-0000-0000" # URL part is stripped
def test_build_grants_with_default_config(pi_finder):
"""Test that grants include default grants from config"""
# Use real data structure but focus on grants behavior
data = {"authorships": [], "grants": []}
builder = CitationBuilder(data=data, doi="10.1000/test", pi_finder=pi_finder)
grants = builder.build_grants()
# Should have at least the default grants from config
# The exact number depends on the config, but should be >= 0
assert isinstance(grants, list)
for grant in grants:
assert len(grant) == 2 # Should have agency and value fields
assert grant[0].name == "grantNumberAgency"
assert grant[1].name == "grantNumberValue"
def test_process_corresponding_author_no_email(pi_finder):
"""Test _process_corresponding_author when no email is available"""
builder = CitationBuilder(
data={"authorships": []}, doi="10.1000/test", pi_finder=pi_finder
)
# Create a Person without email
person = Person(
given_name="John", family_name="Doe", orcid=None, email=None, affiliation=None
)
authorship = {"is_corresponding": True}
result = builder._process_corresponding_author(person, authorship)
# Should return None when no email is available
assert result is None
def test_build_authors_skip_empty_authorships(pi_finder):
"""Test that empty author entries are skipped"""
data_with_empty_authors = {
"authorships": [
{"author": {}}, # Empty author
{}, # No author key
{"author": {"display_name": "John Doe"}}, # Valid author
]
}
builder = CitationBuilder(
data=data_with_empty_authors, doi="10.1000/test", pi_finder=pi_finder
)
authors, corresponding = builder.build_authors()
# Should only process the one valid author
assert len(authors) == 1
assert authors[0].given_name == "John"
assert authors[0].family_name == "Doe"
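
The given/family split asserted above comes from split_name; a naive version capturing just the behavior these tests rely on (a sketch — the real helper in doi2dataset.utils.validation likely handles more cases):

def split_name(display_name: str) -> tuple[str, str]:
    # Treat the last whitespace-separated token as the family name
    given, _, family = display_name.strip().rpartition(" ")
    return given, family

# split_name("John Smith") -> ("John", "Smith")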


@@ -1,4 +1,3 @@
import json
import os
from unittest.mock import patch
@@ -44,27 +43,15 @@ def load_config_test():
Config.load_config(config_path=config_path)
@pytest.fixture
def fake_openalex_response():
"""
Load the saved JSON response from the file 'srep45389.json'
located in the same directory as this test file.
"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
def test_fetch_doi_data_with_file(mocker, openalex_data):
"""
Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
The APIClient.make_request method is patched to return a fake response built from the contents
of 'srep45389_openalex.json', ensuring that the configuration is loaded from 'config_test.yaml'.
"""
doi = "10.1038/srep45389"
fake_response = FakeResponse(fake_openalex_response, 200)
doi = openalex_data["doi"].replace("https://doi.org/", "")
fake_response = FakeResponse(openalex_data, 200)
# Patch the make_request method of APIClient to return our fake_response.
mocker.patch("doi2dataset.APIClient.make_request", return_value=fake_response)
@@ -75,11 +62,11 @@ def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
# Call _fetch_data(), which should now return our fake JSON data.
data = processor._fetch_data()
# Verify that the fetched data matches the fake JSON data.
assert data == fake_openalex_response
# Verify that the fetched data matches the OpenAlex data.
assert data == openalex_data
def test_openalex_abstract_extraction(mocker, fake_openalex_response):
def test_openalex_abstract_extraction(openalex_data):
"""Test the extraction of abstracts from OpenAlex inverted index data."""
# Create API client for AbstractProcessor
api_client = APIClient()
@@ -88,20 +75,20 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
processor = AbstractProcessor(api_client=api_client)
# Call the protected method directly with the fake response
abstract_text = processor._get_openalex_abstract(fake_openalex_response)
result = processor._get_openalex_abstract(openalex_data)
# Verify abstract was extracted
assert abstract_text is not None
assert result is not None
# If abstract exists in the response, it should be properly extracted
if "abstract_inverted_index" in fake_openalex_response:
assert len(abstract_text) > 0
if "abstract_inverted_index" in openalex_data:
assert len(result) > 0
def test_subject_mapper(fake_openalex_response):
def test_subject_mapper(openalex_data):
"""Test that the SubjectMapper correctly maps OpenAlex topics to subjects."""
# Extract topics from the OpenAlex response
topics = fake_openalex_response.get("topics", [])
topics = openalex_data.get("topics", [])
# Get subjects using the class method
subjects = SubjectMapper.get_subjects({"topics": topics})
@@ -111,15 +98,15 @@ def test_subject_mapper(fake_openalex_response):
assert isinstance(subjects, list)
def test_citation_builder(fake_openalex_response):
def test_citation_builder(openalex_data):
"""Test that the CitationBuilder correctly builds author information."""
doi = "10.1038/srep45389"
doi = openalex_data["doi"].replace("https://doi.org/", "")
# Mock PIFinder with an empty list of PIs
pi_finder = PIFinder(pis=[])
# Create builder with required arguments
builder = CitationBuilder(data=fake_openalex_response, doi=doi, pi_finder=pi_finder)
builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder)
# Test building other IDs
other_ids = builder.build_other_ids()
@@ -134,12 +121,10 @@ def test_citation_builder(fake_openalex_response):
assert isinstance(topics, list)
def test_license_processor(fake_openalex_response):
def test_license_processor(openalex_data):
"""Test that the LicenseProcessor correctly identifies and processes licenses."""
# Create a simplified data structure that contains license info
license_data = {
"primary_location": fake_openalex_response.get("primary_location", {})
}
license_data = {"primary_location": openalex_data.get("primary_location", {})}
# Process the license
license_obj = LicenseProcessor.process_license(license_data)
@@ -182,14 +167,14 @@ def test_config_load_invalid_path():
Config.load_config(config_path=invalid_path)
def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
def test_metadata_processor_fetch_data(mocker, openalex_data):
"""Test the _fetch_data method of the MetadataProcessor class with mocked responses."""
doi = "10.1038/srep45389"
doi = openalex_data["doi"].replace("https://doi.org/", "")
# Mock API response
mocker.patch(
"doi2dataset.APIClient.make_request",
return_value=FakeResponse(fake_openalex_response, 200),
return_value=FakeResponse(openalex_data, 200),
)
# Create processor with upload disabled and progress disabled
@@ -200,7 +185,7 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
# Verify that data was fetched correctly
assert data is not None
assert data == fake_openalex_response
assert data == openalex_data
# Verify the DOI is correctly stored
assert processor.doi == doi
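
These tests lean on a FakeResponse helper; a minimal stand-in consistent with how it is called here would be (sketch only — the repository's actual helper may differ):

class FakeResponse:
    # Minimal stand-in for requests.Response as used in these tests
    def __init__(self, payload, status_code):
        self._payload = payload
        self.status_code = status_code

    def json(self):
        return self._payload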


@@ -102,3 +102,82 @@ def test_derivative_allowed_licenses_set_completeness():
"""Test that DERIVATIVE_ALLOWED_LICENSES contains expected licenses"""
expected_licenses = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
assert DERIVATIVE_ALLOWED_LICENSES == expected_licenses
def test_license_processing_with_real_openalex_structure(openalex_data):
"""Test that license processor correctly handles real OpenAlex data structure."""
# Process license data exactly as the real application would
license_obj = LicenseProcessor.process_license(openalex_data)
# Verify the processing logic works with real data structure
assert isinstance(license_obj, License)
assert hasattr(license_obj, "short")
assert hasattr(license_obj, "name")
assert hasattr(license_obj, "uri")
# Test derivative permission logic with real license
if license_obj.short in DERIVATIVE_ALLOWED_LICENSES:
# Should be able to use CrossRef abstract
assert license_obj.short in [
"cc-by",
"cc-by-sa",
"cc-by-nc",
"cc-by-nc-sa",
"cc0",
"pd",
]
else:
# Should use OpenAlex abstract reconstruction
assert license_obj.short not in DERIVATIVE_ALLOWED_LICENSES
def test_license_processing_with_multiple_locations(openalex_data):
"""Test license processing logic with multiple publication locations."""
# Process all locations like the real application might encounter
locations = openalex_data.get("locations", [])
processed_licenses = []
for location in locations:
# Create data structure as it would appear from API
location_data = {"primary_location": location}
license_obj = LicenseProcessor.process_license(location_data)
processed_licenses.append(license_obj)
# Verify processing logic works for all location types
assert len(processed_licenses) > 0
assert all(isinstance(lic, License) for lic in processed_licenses)
# Should handle various license states consistently
for license_obj in processed_licenses:
if license_obj.short != "unknown":
assert isinstance(license_obj.short, str) and license_obj.short
def test_crossref_license_url_mapping_logic(crossref_data):
"""Test license URL to short-form mapping logic with real CrossRef data."""
# Extract license information as the real application would
crossref_licenses = crossref_data.get("message", {}).get("license", [])
if crossref_licenses:
license_url = crossref_licenses[0].get("URL", "")
# Test the mapping logic that would be used in practice
from doi2dataset import LICENSE_MAP
# Find corresponding short form by URL matching
matching_short = None
for short, (uri, _name) in LICENSE_MAP.items():
if uri == license_url:
matching_short = short
break
if matching_short:
# Test that our license processor handles this correctly
test_data = {"primary_location": {"license": matching_short}}
license_obj = LicenseProcessor.process_license(test_data)
assert license_obj.short == matching_short
assert license_obj.uri == license_url
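
The reverse lookup in the last test assumes LICENSE_MAP keys license short forms to (uri, name) pairs; extracted as a helper, the logic is (hypothetical helper, for illustration):

def short_form_for_url(license_url, license_map):
    # Return the short license id whose canonical URI matches, else None
    for short, (uri, _name) in license_map.items():
        if uri == license_url:
            return short
    return None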


@@ -1,5 +1,4 @@
import json
import os
import tempfile
from http import HTTPStatus
from pathlib import Path
@@ -9,14 +8,7 @@ import pytest
from doi2dataset import MetadataProcessor
@pytest.fixture
def openalex_data():
"""Load the saved JSON response from the file 'srep45389.json'"""
json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
with open(json_path, encoding="utf-8") as f:
data = json.load(f)
return data
# openalex_data fixture now comes from conftest.py
@pytest.fixture
@@ -444,6 +436,175 @@ class TestMetadataProcessorErrorHandling:
with pytest.raises(KeyError, match="Missing required field"):
processor.process()
def test_update_progress_with_progress_bar(self):
"""Test progress update when progress bar is enabled."""
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=True
)
processor.console = MagicMock()
# Mock progress bar
mock_progress = MagicMock()
processor.progress = mock_progress
processor.task_id = "test_task_id"
processor._update_progress()
# Verify progress.advance was called
mock_progress.advance.assert_called_once_with("test_task_id")
def test_update_progress_without_progress_bar(self):
"""Test progress update when progress bar is disabled."""
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), progress=False
)
processor.console = MagicMock()
# No progress bar set
processor.progress = None
processor.task_id = None
# Should not raise any errors
processor._update_progress()
@patch("doi2dataset.processing.metadata.APIClient")
def test_upload_success_with_persistent_id(self, mock_api_client_class):
"""Test successful upload with persistent ID response."""
import os
from doi2dataset import Config
# Load test config
config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
Config.load_config(config_path=config_path)
# Mock the APIClient instance and response
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = 201 # Success status for upload
mock_response.json.return_value = {
"data": {"persistentId": "doi:10.7910/DVN/TEST123"}
}
mock_client.make_request.return_value = mock_response
mock_api_client_class.return_value = mock_client
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
)
processor.console = MagicMock()
metadata = {"datasetVersion": {"files": []}}
result = processor._upload_data(metadata)
# Verify successful response handling
assert result["data"]["persistentId"] == "doi:10.7910/DVN/TEST123"
processor.console.print.assert_called()
@patch("doi2dataset.processing.metadata.APIClient")
def test_upload_success_console_output(self, mock_api_client_class):
"""Test console output during successful upload."""
import os
from unittest.mock import Mock
from doi2dataset import Config
# Load test config
config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
Config.load_config(config_path=config_path)
# Mock the APIClient instance and response
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = 201 # Success status for upload
mock_response.json.return_value = {
"data": {"persistentId": "doi:10.7910/DVN/TEST123"}
}
mock_client.make_request.return_value = mock_response
mock_api_client_class.return_value = mock_client
processor = MetadataProcessor(
doi="10.1000/test", output_path=Path("/tmp/test.json"), upload=True
)
processor.console = MagicMock()
metadata = {"datasetVersion": {"files": []}}
processor._upload_data(metadata)
# Verify successful upload message was printed
processor.console.print.assert_called()
call_args = [call[0][0] for call in processor.console.print.call_args_list]
upload_message = next(
(msg for msg in call_args if "Dataset uploaded to:" in msg), None
)
assert upload_message is not None
assert "TEST123" in upload_message
def test_progress_update_integration(self):
"""Test progress updates during complete processing workflow."""
from unittest.mock import patch
# Mock all external dependencies
mock_data = {"title": "Test Paper", "authorships": []}
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._fetch_data",
return_value=mock_data,
):
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._build_metadata",
return_value={"test": "metadata"},
):
with patch(
"doi2dataset.processing.metadata.MetadataProcessor._save_output"
):
processor = MetadataProcessor(
doi="10.1000/test",
output_path=Path("/tmp/test.json"),
progress=True,
)
processor.console = MagicMock()
# Mock progress bar
mock_progress = MagicMock()
processor.progress = mock_progress
processor.task_id = "test_task"
# Process should call _update_progress multiple times
processor.process()
# Verify progress was advanced multiple times (fetch, build, save)
assert mock_progress.advance.call_count >= 3
for call in mock_progress.advance.call_args_list:
assert call[0][0] == "test_task"
def test_fetch_data_with_real_structure(self, openalex_data):
"""Test _fetch_data method with realistic OpenAlex response structure."""
from http import HTTPStatus
from unittest.mock import Mock, patch
mock_client = Mock()
mock_response = Mock()
mock_response.status_code = HTTPStatus.OK
mock_response.json.return_value = openalex_data
# Test fetch_data with real structure
mock_client.make_request.return_value = mock_response
with patch(
"doi2dataset.processing.metadata.APIClient", return_value=mock_client
):
processor = MetadataProcessor(
doi="10.1038/srep45389", output_path=Path("/tmp/test.json")
)
processor.console = MagicMock()
result = processor._fetch_data()
# Verify we got the expected data structure
assert result == openalex_data
assert "title" in result
assert "authorships" in result
assert "publication_date" in result
def test_partial_data(self):
"""Test handling of incomplete API responses."""
with patch(


@@ -84,3 +84,81 @@ def test_person_to_dict_with_no_affiliation():
assert result["family_name"] == "Green"
assert result["given_name"] == "Alice"
assert result["orcid"] == "0000-0002-1111-2222"
def test_person_creation_from_real_authorship_data(openalex_data):
"""Test Person creation by processing real OpenAlex authorship data."""
from doi2dataset.utils.validation import split_name
# Process first authorship like the real application would
first_authorship = openalex_data["authorships"][0]
author_data = first_authorship["author"]
# Extract display_name and process it like CitationBuilder does
display_name = author_data.get("display_name", "")
given_name, family_name = split_name(display_name)
# Extract ORCID and clean it like the real application
orcid = author_data.get("orcid")
if orcid and "orcid.org/" in orcid:
orcid = orcid.split("orcid.org/")[-1]
person = Person(
family_name=family_name,
given_name=given_name,
orcid=orcid,
email=None,
affiliation=None,
)
# Verify the processing worked correctly
assert person.family_name != ""
assert person.given_name != ""
if orcid:
assert len(person.orcid) == 19 # ORCID format: 0000-0000-0000-0000
def test_institution_processing_from_real_data(openalex_data):
"""Test Institution creation by processing real OpenAlex institution data."""
# Process first institution like the real application would
first_authorship = openalex_data["authorships"][0]
institution_data = first_authorship["institutions"][0]
# Extract and process data like CitationBuilder does
display_name = institution_data.get("display_name", "")
ror = institution_data.get("ror", "")
institution = Institution(display_name=display_name, ror=ror)
# Test that processing preserves essential functionality
assert len(institution.display_name) > 0
if ror:
assert ror.startswith("https://ror.org/")
affiliation_field = institution.affiliation_field()
assert affiliation_field.value == ror
assert affiliation_field.expanded_value["termName"] == display_name
def test_multiple_institutions_processing(openalex_data):
"""Test processing multiple institutions from real authorship data."""
institutions_created = []
# Process all institutions like the real application would
for authorship in openalex_data["authorships"]:
for institution_data in authorship.get("institutions", []):
display_name = institution_data.get("display_name", "")
ror = institution_data.get("ror", "")
if display_name: # Only create if there's actual data
institution = Institution(display_name=display_name, ror=ror)
institutions_created.append(institution)
# Verify we processed multiple institutions successfully
assert len(institutions_created) > 0
# All should have valid display names
assert all(len(inst.display_name) > 0 for inst in institutions_created)
# Some should have ROR IDs (based on real data)
ror_institutions = [inst for inst in institutions_created if inst.ror]
assert len(ror_institutions) > 0
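
The ORCID handling exercised in this file reduces to stripping the URL prefix (a sketch under the same assumption the tests make — that OpenAlex returns https://orcid.org/-prefixed IDs):

def normalize_orcid(orcid_url):
    # Strip the orcid.org URL prefix, leaving bare 0000-0000-0000-0000
    if orcid_url and "orcid.org/" in orcid_url:
        return orcid_url.split("orcid.org/")[-1]
    return orcid_url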


@@ -29,6 +29,86 @@ def test_get_publication_year_with_date(metadata_processor):
assert year == ""
def test_publication_year_processing_logic(openalex_data):
"""Test publication year extraction logic with real OpenAlex data structure."""
doi = openalex_data["doi"].replace("https://doi.org/", "")
processor = MetadataProcessor(doi=doi, upload=False, progress=False)
processor.console = MagicMock()
# Test the actual processing logic used by the application
year = processor._get_publication_year(openalex_data)
# Verify the processing logic works (should prefer publication_year field)
assert isinstance(year, int)
assert year > 1900 # Reasonable publication year
assert year <= 2030 # Sanity upper bound
def test_doi_validation_processing_pipeline(openalex_data):
"""Test DOI processing pipeline with real OpenAlex DOI format."""
from doi2dataset.utils.validation import normalize_doi, validate_doi
# Extract DOI as the real application would
doi_from_data = openalex_data.get("doi", "")
# Process DOI through the same pipeline as real application
if doi_from_data.startswith("https://doi.org/"):
clean_doi = doi_from_data.replace("https://doi.org/", "")
else:
clean_doi = doi_from_data
# Test validation and normalization logic
is_valid = validate_doi(clean_doi)
normalized = normalize_doi(clean_doi)
assert is_valid is True
assert normalized.startswith("10.")
assert len(normalized.split("/")) == 2 # Should have registrant/suffix format
def test_subject_mapping_processing_logic(openalex_data):
"""Test subject mapping logic with real OpenAlex topics structure."""
from doi2dataset import SubjectMapper
# Process topics exactly as the real application would
topics = openalex_data.get("topics", [])
# Test SubjectMapper processing logic
subjects = SubjectMapper.get_subjects({"topics": topics})
# Verify the mapping logic produces valid results
assert isinstance(subjects, list)
# If we have topics, we should get subjects
if topics:
assert len(subjects) > 0
# Each subject should be a string
assert all(isinstance(subj, str) for subj in subjects)
def test_abstract_reconstruction_processing(openalex_data):
"""Test abstract reconstruction logic with real inverted index data."""
from doi2dataset.api.client import APIClient
from doi2dataset.api.processors import AbstractProcessor
# Test the actual reconstruction logic used in the application
processor = AbstractProcessor(APIClient())
# Process abstract inverted index as the real application would
reconstructed = processor._get_openalex_abstract(openalex_data)
if openalex_data.get("abstract_inverted_index"):
# Should successfully reconstruct abstract
assert reconstructed is not None
assert isinstance(reconstructed, str)
assert len(reconstructed) > 0
# Should contain readable text with spaces
assert " " in reconstructed
else:
# Should handle missing abstract gracefully
assert reconstructed is None
def test_get_publication_year_with_both_fields(metadata_processor):
"""Test that _get_publication_year prioritizes publication_year over date"""
data = {"publication_year": 2020, "publication_date": "2019-05-15"}


@@ -188,6 +188,47 @@ def test_validate_email_validator_error():
assert result is False
@patch("dns.resolver.resolve")
def test_validate_email_dns_exceptions(mock_resolve):
"""Test email validation with DNS-related exceptions."""
# Test with mocked DNS resolver raising various exceptions
with patch("email_validator.validate_email") as mock_validate:
mock_result = Mock()
mock_result.normalized = "test@example.com"
mock_validate.return_value = mock_result
# Test with NoAnswer exception
mock_resolve.side_effect = dns.resolver.NoAnswer()
result = validate_email_address("test@example.com")
assert result is False
# Test with NXDOMAIN exception
mock_resolve.side_effect = dns.resolver.NXDOMAIN()
result = validate_email_address("test@example.com")
assert result is False
def test_validate_email_validator_exceptions():
"""Test email validation with email_validator exceptions."""
# Test email validator error
with patch("email_validator.validate_email") as mock_validate:
mock_validate.side_effect = EmailNotValidError("Invalid format")
result = validate_email_address("invalid-email")
assert result is False
# Test with various malformed emails that should fail validation
invalid_emails = [
"plainaddress",
"@missingusername.com",
"username@.com",
"username@com",
"username..double.dot@example.com",
]
for email in invalid_emails:
assert validate_email_address(email) is False
# DOI validation edge cases
def test_validate_doi_formats():
"""Test validation of various valid DOI formats."""