feat: add pre-commit setup with gitlint

Alexander Minges 2025-07-14 09:39:07 +02:00
parent b4e9943b7c
commit 9d270ec601
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
17 changed files with 1197 additions and 360 deletions

.pre-commit-config.yaml Normal file

@@ -0,0 +1,54 @@
# Pre-commit configuration for doi2dataset
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks

repos:
  # Built-in pre-commit hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: check-json
      - id: check-toml
      - id: mixed-line-ending
        args: ['--fix=lf']

  # Python code formatting and linting
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.9
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
      - id: ruff-format

  # Git commit message linting with gitlint
  - repo: https://github.com/jorisroovers/gitlint
    rev: v0.19.1
    hooks:
      - id: gitlint
        stages: [commit-msg]

  # Optional: Check for common security issues
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.10
    hooks:
      - id: bandit
        args: ["-c", "pyproject.toml"]
        additional_dependencies: ["bandit[toml]"]

# Configuration for specific hooks
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto fixes from pre-commit hooks

    for more information, see https://pre-commit.ci
  autofix_prs: true
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: weekly
  skip: []
  submodules: false


@@ -152,6 +152,73 @@ Documentation is automatically built and deployed via GitLab CI/CD:
- Deployed to GitLab Pages
- Accessible at your project's Pages URL
## Git Commit Message Linting
This project uses [gitlint](https://jorisroovers.github.io/gitlint/) to enforce consistent commit message formatting. Commit messages should follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
### Commit Message Format
Commit messages must follow this format:
```
<type>(<scope>): <description>
[optional body]
[optional footer(s)]
```
**Types:**
- `feat`: A new feature
- `fix`: A bug fix
- `docs`: Documentation only changes
- `style`: Changes that do not affect the meaning of the code
- `refactor`: A code change that neither fixes a bug nor adds a feature
- `test`: Adding missing tests or correcting existing tests
- `chore`: Changes to the build process or auxiliary tools
- `ci`: Changes to CI configuration files and scripts
- `build`: Changes that affect the build system or dependencies
- `perf`: A code change that improves performance
- `revert`: Reverts a previous commit
**Examples:**
```
feat(api): add support for DOI batch processing
fix(metadata): handle missing author information gracefully
docs: update installation instructions
test(citation): add tests for license processing
```
### Linting Commit Messages
To lint commit messages, use the provided script:
```bash
# Lint the last commit
python scripts/lint-commit.py
# Lint a specific commit
python scripts/lint-commit.py --hash <commit-hash>
# Lint a range of commits
python scripts/lint-commit.py --range HEAD~3..
# Install as a git hook (optional)
python scripts/lint-commit.py --install-hook
```
### Git Hook Installation
You can optionally install a git hook that automatically checks commit messages:
```bash
python scripts/lint-commit.py --install-hook
```
This will create a `commit-msg` hook that runs automatically when you commit, ensuring all commit messages follow the required format.
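For example, once the hook is installed, a message that ignores the format is rejected while a conforming one passes. The commands below are illustrative only; the exact gitlint output depends on the configured rules:
```bash
# Rejected by the commit-msg hook: no conventional commit type in the title
git commit -m "update stuff"

# Accepted: matches <type>(<scope>): <description>
git commit -m "docs: update installation instructions"
```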
## Testing

Tests are implemented with pytest. The test suite provides comprehensive coverage of core functionalities. To run the tests, execute:

@@ -270,6 +337,33 @@ This version has been updated to make the tool more generalized and suitable for

Contributions are welcome! Please fork the repository and submit a pull request with your improvements.
### Development Setup
1. Install development dependencies:
```bash
pip install -r requirements-dev.txt
```
2. Run tests to ensure everything works:
```bash
pytest
```
3. Install the git commit message hook (recommended):
```bash
python scripts/lint-commit.py --install-hook
```
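The repository also ships a `.pre-commit-config.yaml`. If you use the [pre-commit](https://pre-commit.com) framework, its standard commands activate the configured hooks; the steps below are listed for reference and can be adapted to your workflow:
```bash
pip install pre-commit
pre-commit install                          # run the configured hooks on every commit
pre-commit install --hook-type commit-msg   # also enable the gitlint commit-msg hook
pre-commit run --all-files                  # check the whole repository once
```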
### Code Quality
- Follow the existing code style and formatting
- Write tests for new functionality
- Ensure all tests pass before submitting
- Use meaningful commit messages following the conventional commits format
- Run `python scripts/lint-commit.py` to validate commit messages
## License

This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details.


@@ -1,47 +1,47 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help
if "%1" == "multiversion" goto multiversion
if "%1" == "multiversion-clean" goto multiversion-clean

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:multiversion
sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
goto end

:multiversion-clean
rmdir /s /q %BUILDDIR%\html 2>nul
sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd


@@ -0,0 +1,229 @@
Git Commit Message Linting
===========================
This project uses `gitlint <https://jorisroovers.github.io/gitlint/>`_ to enforce consistent commit message formatting. All commit messages must follow the `Conventional Commits <https://www.conventionalcommits.org/>`_ specification to ensure clear and standardized project history.
Why Commit Message Standards Matter
-----------------------------------
Standardized commit messages provide several benefits:
* **Improved readability**: Clear, consistent format makes it easier to understand changes
* **Automated changelog generation**: Tools can parse conventional commits to generate changelogs
* **Better collaboration**: Team members can quickly understand the nature of changes
* **Easier debugging**: Well-formatted commits help identify when bugs were introduced
* **Semantic versioning**: Conventional commits can trigger automated version bumps
Commit Message Format
---------------------
All commit messages must follow this format:
.. code-block:: text
<type>(<scope>): <description>
[optional body]
[optional footer(s)]
Components
~~~~~~~~~~
**Type (required)**
The type of change being made. Must be one of:
* ``feat``: A new feature
* ``fix``: A bug fix
* ``docs``: Documentation only changes
* ``style``: Changes that do not affect the meaning of the code (white-space, formatting, etc.)
* ``refactor``: A code change that neither fixes a bug nor adds a feature
* ``test``: Adding missing tests or correcting existing tests
* ``chore``: Changes to the build process or auxiliary tools and libraries
* ``ci``: Changes to CI configuration files and scripts
* ``build``: Changes that affect the build system or external dependencies
* ``perf``: A code change that improves performance
* ``revert``: Reverts a previous commit
**Scope (optional)**
The scope of the change, enclosed in parentheses. Common scopes for this project:
* ``api``: Changes to API functionality
* ``metadata``: Changes to metadata processing
* ``citation``: Changes to citation building
* ``config``: Changes to configuration handling
* ``tests``: Changes to test files
* ``docs``: Changes to documentation
* ``deps``: Changes to dependencies
**Description (required)**
A short description of the change:
* Use the imperative, present tense: "change" not "changed" nor "changes"
* Don't capitalize the first letter
* No period (.) at the end
* Maximum 50 characters
**Body (optional)**
A longer description of the change:
* Use the imperative, present tense
* Wrap at 72 characters
* Explain what and why vs. how
**Footer (optional)**
One or more footers may be provided:
* ``BREAKING CHANGE:`` description of breaking changes
* ``Closes #123``: reference to closed issues
* ``Co-authored-by: Name <email@example.com>``: additional authors
Examples
--------
**Simple feature addition:**
.. code-block:: text
feat(api): add support for DOI batch processing
**Bug fix with scope:**
.. code-block:: text
fix(metadata): handle missing author information gracefully
**Documentation update:**
.. code-block:: text
docs: update installation instructions
**Breaking change:**
.. code-block:: text
feat(api): change metadata output format
BREAKING CHANGE: The metadata output format has changed from JSON
to YAML. Users need to update their parsing code accordingly.
**Multi-line with body:**
.. code-block:: text
refactor(citation): improve author name parsing
The author name parsing logic has been refactored to handle
more edge cases, including names with multiple middle initials
and international characters.
Closes #45
Configuration
-------------
The project uses a ``.gitlint`` configuration file (sketched below) that enforces:
* Maximum title length of 50 characters
* Conventional commit format validation
* Maximum body line length of 72 characters
* Exclusion of certain words like "WIP", "TODO", "FIXME" in titles
* Automatic ignoring of merge commits and dependency updates
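Assuming a fairly standard gitlint setup, rules of this kind map onto a configuration file roughly like the following sketch. The section and option names are standard gitlint rules; the project's actual ``.gitlint`` may differ in detail.

.. code-block:: ini

   [general]
   # Enable the Conventional Commits contrib rule
   contrib=contrib-title-conventional-commits

   [title-max-length]
   line-length=50

   [body-max-line-length]
   line-length=72

   [title-must-not-contain-word]
   words=WIP,TODO,FIXME

   [ignore-by-title]
   # Skip merge commits and automated dependency updates
   regex=^(Merge|Bump)
   ignore=all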
Linting Tools
-------------
Manual Linting
~~~~~~~~~~~~~~~
Use the provided script to lint commit messages:
.. code-block:: bash
# Lint the last commit
python scripts/lint-commit.py
# Lint a specific commit by hash
python scripts/lint-commit.py --hash <commit-hash>
# Lint a range of commits
python scripts/lint-commit.py --range HEAD~3..
# Check staged commit message
python scripts/lint-commit.py --staged
Git Hook Installation
~~~~~~~~~~~~~~~~~~~~~
Install an automated git hook to check commit messages:
.. code-block:: bash
python scripts/lint-commit.py --install-hook
This creates a ``commit-msg`` hook that automatically validates commit messages when you commit. The commit will be rejected if the message doesn't meet the requirements.
Direct Gitlint Usage
~~~~~~~~~~~~~~~~~~~~
You can also use gitlint directly:
.. code-block:: bash
# Lint last commit
gitlint
# Lint specific commit
gitlint --commit <commit-hash>
# Lint commit range
gitlint --commits HEAD~3..
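Because the commit-msg hook invokes gitlint with ``--msg-filename``, you can also point gitlint at a message file or pipe a candidate message to it on stdin, which is convenient for trying out message formats. The file path below is only an example:

.. code-block:: bash

   # Lint a commit message stored in a file
   gitlint --msg-filename .git/COMMIT_EDITMSG

   # Lint a candidate message from stdin
   echo "feat(api): add support for DOI batch processing" | gitlint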
Common Validation Errors
-------------------------
**Title too long**
Keep titles under 50 characters. If you need more space, use the body.
**Invalid type**
Use only the allowed types: ``feat``, ``fix``, ``docs``, ``style``, ``refactor``, ``test``, ``chore``, ``ci``, ``build``, ``perf``, ``revert``.
**Missing colon**
Don't forget the colon after the type/scope: ``feat(api): add feature``
**Capitalized description**
Don't capitalize the first letter of the description: ``feat: add feature`` not ``feat: Add feature``
**Trailing period**
Don't add a period at the end of the title: ``feat: add feature`` not ``feat: add feature.``
**Body line too long**
Keep body lines under 72 characters. Break long lines appropriately.
Troubleshooting
---------------
**Gitlint not found**
Install development dependencies:
.. code-block:: bash
pip install -r requirements-dev.txt
**Hook not working**
Ensure the hook is executable:
.. code-block:: bash
chmod +x .git/hooks/commit-msg
**Existing commits don't follow format**
The linting only applies to new commits. Existing commits can be left as-is or rebased if necessary.
Integration with CI/CD
----------------------
The commit message linting can be integrated into CI/CD pipelines to ensure all commits in pull requests follow the standard format. This helps maintain consistency across all contributors.
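For instance, a pipeline job might install gitlint and lint every commit that is on the current branch but not yet on the target branch. The branch name and installation method below are assumptions and would need to match the actual CI setup:

.. code-block:: bash

   pip install gitlint
   # Lint all commits on this branch that are not yet on main
   gitlint --commits "origin/main..HEAD"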
For more information on gitlint configuration and advanced usage, see the `official gitlint documentation <https://jorisroovers.github.io/gitlint/>`_.


@@ -115,20 +115,47 @@ Development Setup

      pip install -r requirements-dev.txt

-4. Make your changes
-5. Run tests to ensure everything works
-6. Submit a pull request
+4. Install the git commit message hook (recommended):
+
+   .. code-block:: bash
+
+      python scripts/lint-commit.py --install-hook
+
+5. Make your changes
+6. Run tests to ensure everything works
+7. Validate your commit messages follow the standards
+8. Submit a pull request

Code Style
----------

Please follow the existing code style and conventions used in the project. Make sure to:

-- Write clear, descriptive commit messages
+- Write clear, descriptive commit messages following the :doc:`commit-messages` standards
- Add tests for new functionality
- Update documentation as needed
- Follow Python best practices

Commit Message Standards
~~~~~~~~~~~~~~~~~~~~~~~~

All commit messages must follow the Conventional Commits specification. See the :doc:`commit-messages` documentation for detailed information on:

- Required message format
- Available commit types
- Examples of proper commit messages
- How to use the linting tools

To validate your commit messages:

.. code-block:: bash

   # Lint the last commit
   python scripts/lint-commit.py

   # Install automatic validation hook
   python scripts/lint-commit.py --install-hook

Submitting Changes
------------------

@@ -136,6 +163,7 @@ Submitting Changes

2. Make your changes with appropriate tests
3. Ensure all tests pass
4. Update documentation if needed
-5. Submit a pull request with a clear description of your changes
+5. Ensure all commit messages follow the conventional commits format
+6. Submit a pull request with a clear description of your changes

Thank you for contributing to **doi2dataset**!


@@ -39,4 +39,5 @@ Key Features:
   usage
   modules
   contributing
   commit-messages
   faq

File diff suppressed because it is too large


@@ -50,6 +50,7 @@ dev = [
    "pytest-mock>=3.14.0,<4.0",
    "pytest-cov>=6.0.0,<7.0",
    "ruff>=0.11.1,<0.20",
    "gitlint>=0.19.1,<0.20",
]
test = [
    "pytest>=8.3.5,<9.0",
@@ -132,3 +133,7 @@ ignore = [
[tool.ruff.lint.per-file-ignores]
"tests/*" = ["E501"]

[tool.bandit]
exclude_dirs = ["tests", "docs", ".venv", "build", "dist"]
skips = ["B101", "B601", "B404", "B603"]


@@ -2,3 +2,4 @@ pytest>=8.3.5,<9.0
pytest-mock>=3.14.0,<4.0
pytest-cov>=6.0.0,<7.0
ruff>=0.11.1,<0.20
gitlint>=0.19.1,<0.20

scripts/lint-commit.py Normal file

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Simple script to lint git commit messages using gitlint.

This script can be used to:
1. Lint the last commit message
2. Lint a specific commit by hash
3. Lint commit messages in a range
4. Be used as a pre-commit hook

Usage:
    python scripts/lint-commit.py                   # Lint last commit
    python scripts/lint-commit.py --hash <hash>     # Lint specific commit
    python scripts/lint-commit.py --range <range>   # Lint commit range
    python scripts/lint-commit.py --staged          # Lint staged commit message

This implementation enforces conventional commit message format.
"""

import argparse
import subprocess
import sys
from pathlib import Path


def run_command(cmd, check=True):
    """Run a shell command and return the result."""
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=check)
        return result
    except subprocess.CalledProcessError as e:
        print(f"Error running command: {cmd}")
        print(f"Exit code: {e.returncode}")
        print(f"Output: {e.stdout}")
        print(f"Error: {e.stderr}")
        return e


def check_gitlint_installed():
    """Check if gitlint is installed."""
    result = run_command(["which", "gitlint"], check=False)
    if result.returncode != 0:
        print("Error: gitlint is not installed.")
        print("Please install it with: pip install gitlint")
        print("Or install dev dependencies: pip install -r requirements-dev.txt")
        sys.exit(1)


def lint_commit(commit_hash=None, commit_range=None, staged=False):
    """Lint commit message(s) using gitlint."""
    # Build gitlint command
    cmd = ["gitlint"]

    if staged:
        # Lint staged commit message
        cmd.extend(["--staged"])
    elif commit_range:
        # Lint commit range
        cmd.extend(["--commits", commit_range])
    elif commit_hash:
        # Lint specific commit
        cmd.extend(["--commit", commit_hash])
    else:
        # Lint last commit (default)
        cmd.extend(["--commit", "HEAD"])

    print(f"Running: {' '.join(cmd)}")
    print("-" * 50)

    # Run gitlint
    result = run_command(cmd, check=False)

    if result.returncode == 0:
        print("✅ All commit messages are valid!")
        return True
    else:
        print("❌ Commit message validation failed:")
        print(result.stdout)
        if result.stderr:
            print("Error output:")
            print(result.stderr)
        return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Lint git commit messages using gitlint",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                    # Lint last commit
  %(prog)s --hash abc123      # Lint specific commit
  %(prog)s --range HEAD~3..   # Lint last 3 commits
  %(prog)s --staged           # Lint staged commit message
        """,
    )

    parser.add_argument("--hash", help="Specific commit hash to lint")
    parser.add_argument("--range", help="Commit range to lint (e.g., HEAD~3..)")
    parser.add_argument(
        "--staged", action="store_true", help="Lint staged commit message"
    )
    parser.add_argument(
        "--install-hook", action="store_true", help="Install as git commit-msg hook"
    )

    args = parser.parse_args()

    # Check if gitlint is installed
    check_gitlint_installed()

    # Install hook if requested
    if args.install_hook:
        install_hook()
        return

    # Validate arguments
    exclusive_args = [args.hash, args.range, args.staged]
    if sum(bool(arg) for arg in exclusive_args) > 1:
        print("Error: --hash, --range, and --staged are mutually exclusive")
        sys.exit(1)

    # Lint commits
    success = lint_commit(
        commit_hash=args.hash, commit_range=args.range, staged=args.staged
    )

    sys.exit(0 if success else 1)


def install_hook():
    """Install the script as a git commit-msg hook."""
    git_dir = Path(".git")
    if not git_dir.exists():
        print("Error: Not in a git repository")
        sys.exit(1)

    hooks_dir = git_dir / "hooks"
    hooks_dir.mkdir(exist_ok=True)

    hook_file = hooks_dir / "commit-msg"
    hook_content = """#!/usr/bin/env python3
# Git commit-msg hook for gitlint
# Python-based commit message linting with gitlint

import subprocess
import sys

# Run gitlint on the commit message
result = subprocess.run(  # nosec B603
    ["gitlint", "--msg-filename", sys.argv[1]],
    capture_output=True,
    text=True
)

if result.returncode != 0:
    print("Commit message validation failed:")
    print(result.stdout)
    if result.stderr:
        print("Error output:")
        print(result.stderr)
    sys.exit(1)

print("✅ Commit message is valid!")
"""

    hook_file.write_text(hook_content)
    hook_file.chmod(0o755)

    print(f"✅ Installed commit-msg hook at {hook_file}")
    print("The hook will automatically run when you commit.")


if __name__ == "__main__":
    main()


@@ -23,7 +23,7 @@ def test_pi():
        given_name="Author",
        orcid="0000-0000-0000-1234",
        email="test.author@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
    )
@@ -115,7 +115,9 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
        pytest.skip("Test data doesn't contain any ROR identifiers")

    # Create builder with ror=True to enable ROR identifiers
-    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True)
+    builder = CitationBuilder(
+        data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True
+    )

    # Get authors
    authors, _ = builder.build_authors()
@@ -129,11 +131,11 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
    for author in authors:
        # Check if author has affiliation
-        if not hasattr(author, 'affiliation') or not author.affiliation:
+        if not hasattr(author, "affiliation") or not author.affiliation:
            continue

        # Check if affiliation is an Institution with a ROR ID
-        if not hasattr(author.affiliation, 'ror'):
+        if not hasattr(author.affiliation, "ror"):
            continue

        # Check if ROR ID is present and contains "ror.org"
@@ -154,7 +156,7 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
    assert affiliation_field.value == institution_with_ror.ror

    # Verify the expanded_value dictionary has the expected structure
-    assert hasattr(affiliation_field, 'expanded_value')
+    assert hasattr(affiliation_field, "expanded_value")
    assert isinstance(affiliation_field.expanded_value, dict)

    # Check specific fields in the expanded_value


@@ -13,6 +13,7 @@ def test_sanitize_filename():
    result = sanitize_filename(doi)
    assert result == expected


def test_split_name_with_comma():
    """Test splitting a full name that contains a comma."""
    full_name = "Doe, John"
@@ -20,6 +21,7 @@ def test_split_name_with_comma():
    assert given == "John"
    assert family == "Doe"


def test_split_name_without_comma():
    """Test splitting a full name that does not contain a comma."""
    full_name = "John Doe"
@@ -27,11 +29,13 @@ def test_split_name_without_comma():
    assert given == "John"
    assert family == "Doe"


def test_validate_email_address_valid():
    """Test that a valid email address is correctly recognized."""
    valid_email = "john.doe@iana.org"
    assert validate_email_address(valid_email) is True


def test_validate_email_address_invalid():
    """Test that an invalid email address is correctly rejected."""
    invalid_email = "john.doe@invalid_domain"


@@ -20,6 +20,7 @@ class FakeResponse:
    """
    A fake response object to simulate an API response.
    """

    def __init__(self, json_data, status_code=200):
        self._json = json_data
        self.status_code = status_code
@@ -30,6 +31,7 @@ class FakeResponse:
    def raise_for_status(self):
        pass


@pytest.fixture(autouse=True)
def load_config_test():
    """
@@ -39,6 +41,7 @@ def load_config_test():
    config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
    Config.load_config(config_path=config_path)


@pytest.fixture
def fake_openalex_response():
    """
@@ -50,6 +53,7 @@ def fake_openalex_response():
        data = json.load(f)
    return data


def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
    """
    Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
@@ -88,7 +92,7 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
    assert abstract_text is not None

    # If abstract exists in the response, it should be properly extracted
-    if 'abstract_inverted_index' in fake_openalex_response:
+    if "abstract_inverted_index" in fake_openalex_response:
        assert len(abstract_text) > 0
@@ -152,7 +156,7 @@ def test_pi_finder_find_by_orcid():
        given_name="Jon",
        orcid="0000-0000-0000-0000",
        email="jon.doe@iana.org",
-        affiliation="Institute of Science, Some University"
+        affiliation="Institute of Science, Some University",
    )

    # Create PIFinder with our test PI
@@ -181,8 +185,10 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
    doi = "10.1038/srep45389"

    # Mock API response
-    mocker.patch("doi2dataset.APIClient.make_request",
-                 return_value=FakeResponse(fake_openalex_response, 200))
+    mocker.patch(
+        "doi2dataset.APIClient.make_request",
+        return_value=FakeResponse(fake_openalex_response, 200),
+    )

    # Create processor with upload disabled and progress disabled
    processor = MetadataProcessor(doi=doi, upload=False, progress=False)


@@ -3,37 +3,27 @@ from doi2dataset import License, LicenseProcessor
def test_license_processor_cc_by():
    """Test processing a CC BY license"""
-    data = {
-        "primary_location": {
-            "license": "cc-by"
-        }
-    }
+    data = {"primary_location": {"license": "cc-by"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "cc-by"
    assert license_obj.name == "CC BY 4.0"
    assert license_obj.uri == "https://creativecommons.org/licenses/by/4.0/"


def test_license_processor_cc0():
    """Test processing a CC0 license"""
-    data = {
-        "primary_location": {
-            "license": "cc0"
-        }
-    }
+    data = {"primary_location": {"license": "cc0"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "cc0"
    assert license_obj.name == "CC0 1.0"
    assert license_obj.uri == "https://creativecommons.org/publicdomain/zero/1.0/"


def test_license_processor_unknown_license():
    """Test processing an unknown license"""
-    data = {
-        "primary_location": {
-            "license": "unknown-license"
-        }
-    }
+    data = {"primary_location": {"license": "unknown-license"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "unknown-license"
@@ -41,17 +31,17 @@ def test_license_processor_unknown_license():
    assert license_obj.name == "unknown-license" or license_obj.name == ""
    assert hasattr(license_obj, "uri")


def test_license_processor_no_license():
    """Test processing with no license information"""
-    data = {
-        "primary_location": {}
-    }
+    data = {"primary_location": {}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "unknown"
    assert license_obj.name == ""
    assert license_obj.uri == ""


def test_license_processor_no_primary_location():
    """Test processing with no primary location"""
    data = {}


@@ -33,7 +33,10 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -47,21 +50,23 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
    # Verify the basic metadata fields were extracted correctly
    assert metadata is not None
-    assert 'datasetVersion' in metadata
+    assert "datasetVersion" in metadata

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for basic metadata fields in a more flexible way
-    field_names = [field.get('typeName') for field in fields]
-    assert 'title' in field_names
-    assert 'subject' in field_names
-    assert 'dsDescription' in field_names  # Description is named 'dsDescription' in the schema
+    field_names = [field.get("typeName") for field in fields]
+    assert "title" in field_names
+    assert "subject" in field_names
+    assert (
+        "dsDescription" in field_names
+    )  # Description is named 'dsDescription' in the schema


def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
@@ -73,7 +78,10 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -86,33 +94,35 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    metadata = metadata_processor._build_metadata(openalex_data)

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for author and datasetContact fields
-    field_names = [field.get('typeName') for field in fields]
-    assert 'author' in field_names
-    assert 'datasetContact' in field_names
+    field_names = [field.get("typeName") for field in fields]
+    assert "author" in field_names
+    assert "datasetContact" in field_names

    # Verify these are compound fields with actual entries
    for field in fields:
-        if field.get('typeName') == 'author':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "author":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0

-        if field.get('typeName') == 'datasetContact':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
+        if field.get("typeName") == "datasetContact":
+            assert "value" in field
+            assert isinstance(field["value"], list)
            # The datasetContact might be empty in test environment
            # Just check it exists rather than asserting length


-def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, monkeypatch):
+def test_build_metadata_keywords_and_topics(
+    metadata_processor, openalex_data, monkeypatch
+):
    """Test that _build_metadata correctly extracts keywords and topics"""
    # Mock the console to avoid print errors
    metadata_processor.console = MagicMock()
@@ -121,7 +131,10 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -134,27 +147,27 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
    metadata = metadata_processor._build_metadata(openalex_data)

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for keyword and subject fields
-    field_names = [field.get('typeName') for field in fields]
+    field_names = [field.get("typeName") for field in fields]

    # If keywords exist, verify structure
-    if 'keyword' in field_names:
+    if "keyword" in field_names:
        for field in fields:
-            if field.get('typeName') == 'keyword':
-                assert 'value' in field
-                assert isinstance(field['value'], list)
+            if field.get("typeName") == "keyword":
+                assert "value" in field
+                assert isinstance(field["value"], list)

    # Check for subject field which should definitely exist
-    assert 'subject' in field_names
+    assert "subject" in field_names
    for field in fields:
-        if field.get('typeName') == 'subject':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "subject":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0


@@ -8,7 +8,7 @@ def test_person_to_dict_with_string_affiliation():
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
    )

    result = person.to_dict()
@@ -29,7 +29,7 @@ def test_person_to_dict_with_institution_ror():
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation=inst
+        affiliation=inst,
    )

    result = person.to_dict()
@@ -48,7 +48,7 @@ def test_person_to_dict_with_institution_display_name_only():
        family_name="Smith",
        given_name="Jane",
        orcid="0000-0001-9876-5432",
-        affiliation=inst
+        affiliation=inst,
    )

    result = person.to_dict()
@@ -63,11 +63,7 @@ def test_person_to_dict_with_empty_institution():
    # Create an Institution with empty values
    inst = Institution("")

-    person = Person(
-        family_name="Brown",
-        given_name="Robert",
-        affiliation=inst
-    )
+    person = Person(family_name="Brown", given_name="Robert", affiliation=inst)

    result = person.to_dict()
@@ -79,9 +75,7 @@ def test_person_to_dict_with_empty_institution():
def test_person_to_dict_with_no_affiliation():
    """Test Person.to_dict() with no affiliation."""
    person = Person(
-        family_name="Green",
-        given_name="Alice",
-        orcid="0000-0002-1111-2222"
+        family_name="Green", given_name="Alice", orcid="0000-0002-1111-2222"
    )

    result = person.to_dict()


@@ -14,44 +14,44 @@ def metadata_processor():
    processor.console = MagicMock()
    return processor


def test_get_publication_year_with_publication_year(metadata_processor):
    """Test that _get_publication_year extracts year from publication_year field"""
    data = {"publication_year": 2020}
    year = metadata_processor._get_publication_year(data)
    assert year == 2020


def test_get_publication_year_with_date(metadata_processor):
    """Test that _get_publication_year returns empty string when publication_year is missing"""
    data = {"publication_date": "2019-05-15"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""


def test_get_publication_year_with_both_fields(metadata_processor):
    """Test that _get_publication_year prioritizes publication_year over date"""
-    data = {
-        "publication_year": 2020,
-        "publication_date": "2019-05-15"
-    }
+    data = {"publication_year": 2020, "publication_date": "2019-05-15"}
    year = metadata_processor._get_publication_year(data)
    assert year == 2020


def test_get_publication_year_with_partial_date(metadata_processor):
    """Test that _get_publication_year returns empty string when only publication_date is present"""
    data = {"publication_date": "2018"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""


def test_get_publication_year_with_missing_data(metadata_processor):
    """Test that _get_publication_year handles missing data"""
    data = {"other_field": "value"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""


def test_get_publication_year_with_invalid_data(metadata_processor):
    """Test that _get_publication_year returns whatever is in publication_year field"""
-    data = {
-        "publication_year": "not-a-year",
-        "publication_date": "invalid-date"
-    }
+    data = {"publication_year": "not-a-year", "publication_date": "invalid-date"}
    year = metadata_processor._get_publication_year(data)
    assert year == "not-a-year"