feat: add pre-commit setup with gitlint

2025-07-14 09:39:07 +02:00 · 2025-07-14 09:39:07 +02:00 · 9d270ec601
commit 9d270ec601
parent b4e9943b7c
17 changed files with 1197 additions and 360 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,54 @@
+# Pre-commit configuration for doi2dataset
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+
+repos:
+  # Built-in pre-commit hooks
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: check-json
+      - id: check-toml
+      - id: mixed-line-ending
+        args: ['--fix=lf']
+
+  # Python code formatting and linting
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      - id: ruff-format
+
+  # Git commit message linting with gitlint
+  - repo: https://github.com/jorisroovers/gitlint
+    rev: v0.19.1
+    hooks:
+      - id: gitlint
+        stages: [commit-msg]
+
+  # Optional: Check for common security issues
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.10
+    hooks:
+      - id: bandit
+        args: ["-c", "pyproject.toml"]
+        additional_dependencies: ["bandit[toml]"]
+
+# Configuration for specific hooks
+ci:
+  autofix_commit_msg: |
+    [pre-commit.ci] auto fixes from pre-commit hooks
+
+    for more information, see https://pre-commit.ci
+  autofix_prs: true
+  autoupdate_branch: ''
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
+  autoupdate_schedule: weekly
+  skip: []
+  submodules: false
--- a/README.md
+++ b/README.md
@ -152,6 +152,73 @@ Documentation is automatically built and deployed via GitLab CI/CD:
 - Deployed to GitLab Pages
 - Accessible at your project's Pages URL

+## Git Commit Message Linting
+
+This project uses [gitlint](https://jorisroovers.github.io/gitlint/) to enforce consistent commit message formatting. Commit messages should follow the [Conventional Commits](https://www.conventionalcommits.org/) specification.
+
+### Commit Message Format
+
+Commit messages must follow this format:
+
+```
+<type>(<scope>): <description>
+
+[optional body]
+
+[optional footer(s)]
+```
+
+**Types:**
+
+- `feat`: A new feature
+- `fix`: A bug fix
+- `docs`: Documentation only changes
+- `style`: Changes that do not affect the meaning of the code
+- `refactor`: A code change that neither fixes a bug nor adds a feature
+- `test`: Adding missing tests or correcting existing tests
+- `chore`: Changes to the build process or auxiliary tools
+- `ci`: Changes to CI configuration files and scripts
+- `build`: Changes that affect the build system or dependencies
+- `perf`: A code change that improves performance
+- `revert`: Reverts a previous commit
+
+**Examples:**
+
+```
+feat(api): add support for DOI batch processing
+fix(metadata): handle missing author information gracefully
+docs: update installation instructions
+test(citation): add tests for license processing
+```
+
+### Linting Commit Messages
+
+To lint commit messages, use the provided script:
+
+```bash
+# Lint the last commit
+python scripts/lint-commit.py
+
+# Lint a specific commit
+python scripts/lint-commit.py --hash <commit-hash>
+
+# Lint a range of commits
+python scripts/lint-commit.py --range HEAD~3..
+
+# Install as a git hook (optional)
+python scripts/lint-commit.py --install-hook
+```
+
+### Git Hook Installation
+
+You can optionally install a git hook that automatically checks commit messages:
+
+```bash
+python scripts/lint-commit.py --install-hook
+```
+
+This will create a `commit-msg` hook that runs automatically when you commit, ensuring all commit messages follow the required format.
+
 ## Testing

 Tests are implemented with pytest. The test suite provides comprehensive coverage of core functionalities. To run the tests, execute:
@ -270,6 +337,33 @@ This version has been updated to make the tool more generalized and suitable for

 Contributions are welcome! Please fork the repository and submit a pull request with your improvements.

+### Development Setup
+
+1. Install development dependencies:
+
+   ```bash
+   pip install -r requirements-dev.txt
+   ```
+
+2. Run tests to ensure everything works:
+
+   ```bash
+   pytest
+   ```
+
+3. Install the git commit message hook (recommended):
+   ```bash
+   python scripts/lint-commit.py --install-hook
+   ```
+
+### Code Quality
+
+- Follow the existing code style and formatting
+- Write tests for new functionality
+- Ensure all tests pass before submitting
+- Use meaningful commit messages following the conventional commits format
+- Run `python scripts/lint-commit.py` to validate commit messages
+
 ## License

 This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details.
--- a/docs/make.bat
+++ b/docs/make.bat
@ -1,47 +1,47 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-if "%1" == "multiversion" goto multiversion
-if "%1" == "multiversion-clean" goto multiversion-clean
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:multiversion
-sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
-goto end
-
-:multiversion-clean
-rmdir /s /q %BUILDDIR%\html 2>nul
-sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+if "%1" == "multiversion" goto multiversion
+if "%1" == "multiversion-clean" goto multiversion-clean
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:multiversion
+sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
+goto end
+
+:multiversion-clean
+rmdir /s /q %BUILDDIR%\html 2>nul
+sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/docs/source/commit-messages.rst
+++ b/docs/source/commit-messages.rst
@ -0,0 +1,229 @@
+Git Commit Message Linting
+===========================
+
+This project uses `gitlint <https://jorisroovers.github.io/gitlint/>`_ to enforce consistent commit message formatting. All commit messages must follow the `Conventional Commits <https://www.conventionalcommits.org/>`_ specification to ensure clear and standardized project history.
+
+Why Commit Message Standards Matter
+-----------------------------------
+
+Standardized commit messages provide several benefits:
+
+* **Improved readability**: Clear, consistent format makes it easier to understand changes
+* **Automated changelog generation**: Tools can parse conventional commits to generate changelogs
+* **Better collaboration**: Team members can quickly understand the nature of changes
+* **Easier debugging**: Well-formatted commits help identify when bugs were introduced
+* **Semantic versioning**: Conventional commits can trigger automated version bumps
+
+Commit Message Format
+---------------------
+
+All commit messages must follow this format:
+
+.. code-block:: text
+
+    <type>(<scope>): <description>
+
+    [optional body]
+
+    [optional footer(s)]
+
+Components
+~~~~~~~~~~
+
+**Type (required)**
+  The type of change being made. Must be one of:
+
+  * ``feat``: A new feature
+  * ``fix``: A bug fix
+  * ``docs``: Documentation only changes
+  * ``style``: Changes that do not affect the meaning of the code (white-space, formatting, etc.)
+  * ``refactor``: A code change that neither fixes a bug nor adds a feature
+  * ``test``: Adding missing tests or correcting existing tests
+  * ``chore``: Changes to the build process or auxiliary tools and libraries
+  * ``ci``: Changes to CI configuration files and scripts
+  * ``build``: Changes that affect the build system or external dependencies
+  * ``perf``: A code change that improves performance
+  * ``revert``: Reverts a previous commit
+
+**Scope (optional)**
+  The scope of the change, enclosed in parentheses. Common scopes for this project:
+
+  * ``api``: Changes to API functionality
+  * ``metadata``: Changes to metadata processing
+  * ``citation``: Changes to citation building
+  * ``config``: Changes to configuration handling
+  * ``tests``: Changes to test files
+  * ``docs``: Changes to documentation
+  * ``deps``: Changes to dependencies
+
+**Description (required)**
+  A short description of the change:
+
+  * Use the imperative, present tense: "change" not "changed" nor "changes"
+  * Don't capitalize the first letter
+  * No period (.) at the end
+  * Maximum 50 characters
+
+**Body (optional)**
+  A longer description of the change:
+
+  * Use the imperative, present tense
+  * Wrap at 72 characters
+  * Explain what and why vs. how
+
+**Footer (optional)**
+  One or more footers may be provided:
+
+  * ``BREAKING CHANGE:`` description of breaking changes
+  * ``Closes #123``: reference to closed issues
+  * ``Co-authored-by: Name <email@example.com>``: additional authors
+
+Examples
+--------
+
+**Simple feature addition:**
+
+.. code-block:: text
+
+    feat(api): add support for DOI batch processing
+
+**Bug fix with scope:**
+
+.. code-block:: text
+
+    fix(metadata): handle missing author information gracefully
+
+**Documentation update:**
+
+.. code-block:: text
+
+    docs: update installation instructions
+
+**Breaking change:**
+
+.. code-block:: text
+
+    feat(api): change metadata output format
+
+    BREAKING CHANGE: The metadata output format has changed from JSON
+    to YAML. Users need to update their parsing code accordingly.
+
+**Multi-line with body:**
+
+.. code-block:: text
+
+    refactor(citation): improve author name parsing
+
+    The author name parsing logic has been refactored to handle
+    more edge cases, including names with multiple middle initials
+    and international characters.
+
+    Closes #45
+
+Configuration
+-------------
+
+The project uses a ``.gitlint`` configuration file that enforces:
+
+* Maximum title length of 50 characters
+* Conventional commit format validation
+* Maximum body line length of 72 characters
+* Exclusion of certain words like "WIP", "TODO", "FIXME" in titles
+* Automatic ignoring of merge commits and dependency updates
+
+Linting Tools
+-------------
+
+Manual Linting
+~~~~~~~~~~~~~~~
+
+Use the provided script to lint commit messages:
+
+.. code-block:: bash
+
+    # Lint the last commit
+    python scripts/lint-commit.py
+
+    # Lint a specific commit by hash
+    python scripts/lint-commit.py --hash <commit-hash>
+
+    # Lint a range of commits
+    python scripts/lint-commit.py --range HEAD~3..
+
+    # Check staged commit message
+    python scripts/lint-commit.py --staged
+
+Git Hook Installation
+~~~~~~~~~~~~~~~~~~~~~
+
+Install an automated git hook to check commit messages:
+
+.. code-block:: bash
+
+    python scripts/lint-commit.py --install-hook
+
+This creates a ``commit-msg`` hook that automatically validates commit messages when you commit. The commit will be rejected if the message doesn't meet the requirements.
+
+Direct Gitlint Usage
+~~~~~~~~~~~~~~~~~~~~
+
+You can also use gitlint directly:
+
+.. code-block:: bash
+
+    # Lint last commit
+    gitlint
+
+    # Lint specific commit
+    gitlint --commit <commit-hash>
+
+    # Lint commit range
+    gitlint --commits HEAD~3..
+
+Common Validation Errors
+-------------------------
+
+**Title too long**
+  Keep titles under 50 characters. If you need more space, use the body.
+
+**Invalid type**
+  Use only the allowed types: ``feat``, ``fix``, ``docs``, ``style``, ``refactor``, ``test``, ``chore``, ``ci``, ``build``, ``perf``, ``revert``.
+
+**Missing colon**
+  Don't forget the colon after the type/scope: ``feat(api): add feature``
+
+**Capitalized description**
+  Don't capitalize the first letter of the description: ``feat: add feature`` not ``feat: Add feature``
+
+**Trailing period**
+  Don't add a period at the end of the title: ``feat: add feature`` not ``feat: add feature.``
+
+**Body line too long**
+  Keep body lines under 72 characters. Break long lines appropriately.
+
+Troubleshooting
+---------------
+
+**Gitlint not found**
+  Install development dependencies:
+
+  .. code-block:: bash
+
+      pip install -r requirements-dev.txt
+
+**Hook not working**
+  Ensure the hook is executable:
+
+  .. code-block:: bash
+
+      chmod +x .git/hooks/commit-msg
+
+**Existing commits don't follow format**
+  The linting only applies to new commits. Existing commits can be left as-is or rebased if necessary.
+
+Integration with CI/CD
+----------------------
+
+The commit message linting can be integrated into CI/CD pipelines to ensure all commits in pull requests follow the standard format. This helps maintain consistency across all contributors.
+
+For more information on gitlint configuration and advanced usage, see the `official gitlint documentation <https://jorisroovers.github.io/gitlint/>`_.
--- a/docs/source/contributing.rst
+++ b/docs/source/contributing.rst
@ -115,20 +115,47 @@ Development Setup

      pip install -r requirements-dev.txt

-4. Make your changes
-5. Run tests to ensure everything works
-6. Submit a pull request
+4. Install the git commit message hook (recommended):
+
+   .. code-block:: bash
+
+      python scripts/lint-commit.py --install-hook
+
+5. Make your changes
+6. Run tests to ensure everything works
+7. Validate your commit messages follow the standards
+8. Submit a pull request

 Code Style
 ----------

 Please follow the existing code style and conventions used in the project. Make sure to:

- Write clear, descriptive commit messages
+- Write clear, descriptive commit messages following the :doc:`commit-messages` standards
 - Add tests for new functionality
 - Update documentation as needed
 - Follow Python best practices

+Commit Message Standards
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+All commit messages must follow the Conventional Commits specification. See the :doc:`commit-messages` documentation for detailed information on:
+
+- Required message format
+- Available commit types
+- Examples of proper commit messages
+- How to use the linting tools
+
+To validate your commit messages:
+
+.. code-block:: bash
+
+   # Lint the last commit
+   python scripts/lint-commit.py
+
+   # Install automatic validation hook
+   python scripts/lint-commit.py --install-hook
+
 Submitting Changes
 ------------------

@ -136,6 +163,7 @@ Submitting Changes
 2. Make your changes with appropriate tests
 3. Ensure all tests pass
 4. Update documentation if needed
-5. Submit a pull request with a clear description of your changes
+5. Ensure all commit messages follow the conventional commits format
+6. Submit a pull request with a clear description of your changes

 Thank you for contributing to **doi2dataset**!
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -39,4 +39,5 @@ Key Features:
   usage
   modules
   contributing
+   commit-messages
   faq
--- a/doi2dataset.py
+++ b/doi2dataset.py
--- a/pyproject.toml
+++ b/pyproject.toml
@ -50,6 +50,7 @@ dev = [
    "pytest-mock>=3.14.0,<4.0",
    "pytest-cov>=6.0.0,<7.0",
    "ruff>=0.11.1,<0.20",
+    "gitlint>=0.19.1,<0.20",
 ]
 test = [
    "pytest>=8.3.5,<9.0",
@ -132,3 +133,7 @@ ignore = [

 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["E501"]
+
+[tool.bandit]
+exclude_dirs = ["tests", "docs", ".venv", "build", "dist"]
+skips = ["B101", "B601", "B404", "B603"]
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -2,3 +2,4 @@ pytest>=8.3.5,<9.0
 pytest-mock>=3.14.0,<4.0
 pytest-cov>=6.0.0,<7.0
 ruff>=0.11.1,<0.20
+gitlint>=0.19.1,<0.20
--- a/scripts/lint-commit.py
+++ b/scripts/lint-commit.py
@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+Simple script to lint git commit messages using gitlint.
+
+This script can be used to:
+1. Lint the last commit message
+2. Lint a specific commit by hash
+3. Lint commit messages in a range
+4. Be used as a pre-commit hook
+
+Usage:
+    python scripts/lint-commit.py              # Lint last commit
+    python scripts/lint-commit.py --hash <hash>  # Lint specific commit
+    python scripts/lint-commit.py --range <range>  # Lint commit range
+    python scripts/lint-commit.py --staged     # Lint staged commit message
+
+This implementation enforces conventional commit message format.
+"""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run_command(cmd, check=True):
+    """Run a shell command and return the result."""
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=check)
+        return result
+    except subprocess.CalledProcessError as e:
+        print(f"Error running command: {cmd}")
+        print(f"Exit code: {e.returncode}")
+        print(f"Output: {e.stdout}")
+        print(f"Error: {e.stderr}")
+        return e
+
+
+def check_gitlint_installed():
+    """Check if gitlint is installed."""
+    result = run_command(["which", "gitlint"], check=False)
+    if result.returncode != 0:
+        print("Error: gitlint is not installed.")
+        print("Please install it with: pip install gitlint")
+        print("Or install dev dependencies: pip install -r requirements-dev.txt")
+        sys.exit(1)
+
+
+def lint_commit(commit_hash=None, commit_range=None, staged=False):
+    """Lint commit message(s) using gitlint."""
+    # Build gitlint command
+    cmd = ["gitlint"]
+
+    if staged:
+        # Lint staged commit message
+        cmd.extend(["--staged"])
+    elif commit_range:
+        # Lint commit range
+        cmd.extend(["--commits", commit_range])
+    elif commit_hash:
+        # Lint specific commit
+        cmd.extend(["--commit", commit_hash])
+    else:
+        # Lint last commit (default)
+        cmd.extend(["--commit", "HEAD"])
+
+    print(f"Running: {' '.join(cmd)}")
+    print("-" * 50)
+
+    # Run gitlint
+    result = run_command(cmd, check=False)
+
+    if result.returncode == 0:
+        print("✅ All commit messages are valid!")
+        return True
+    else:
+        print("❌ Commit message validation failed:")
+        print(result.stdout)
+        if result.stderr:
+            print("Error output:")
+            print(result.stderr)
+        return False
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Lint git commit messages using gitlint",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    %(prog)s                    # Lint last commit
+    %(prog)s --hash abc123      # Lint specific commit
+    %(prog)s --range HEAD~3..   # Lint last 3 commits
+    %(prog)s --staged           # Lint staged commit message
+        """,
+    )
+
+    parser.add_argument("--hash", help="Specific commit hash to lint")
+
+    parser.add_argument("--range", help="Commit range to lint (e.g., HEAD~3..)")
+
+    parser.add_argument(
+        "--staged", action="store_true", help="Lint staged commit message"
+    )
+
+    parser.add_argument(
+        "--install-hook", action="store_true", help="Install as git commit-msg hook"
+    )
+
+    args = parser.parse_args()
+
+    # Check if gitlint is installed
+    check_gitlint_installed()
+
+    # Install hook if requested
+    if args.install_hook:
+        install_hook()
+        return
+
+    # Validate arguments
+    exclusive_args = [args.hash, args.range, args.staged]
+    if sum(bool(arg) for arg in exclusive_args) > 1:
+        print("Error: --hash, --range, and --staged are mutually exclusive")
+        sys.exit(1)
+
+    # Lint commits
+    success = lint_commit(
+        commit_hash=args.hash, commit_range=args.range, staged=args.staged
+    )
+
+    sys.exit(0 if success else 1)
+
+
+def install_hook():
+    """Install the script as a git commit-msg hook."""
+    git_dir = Path(".git")
+    if not git_dir.exists():
+        print("Error: Not in a git repository")
+        sys.exit(1)
+
+    hooks_dir = git_dir / "hooks"
+    hooks_dir.mkdir(exist_ok=True)
+
+    hook_file = hooks_dir / "commit-msg"
+
+    hook_content = """#!/usr/bin/env python3
+# Git commit-msg hook for gitlint
+# Python-based commit message linting with gitlint
+import subprocess
+import sys
+
+# Run gitlint on the commit message
+result = subprocess.run(  # nosec B603
+    ["gitlint", "--msg-filename", sys.argv[1]],
+    capture_output=True,
+    text=True
+)
+
+if result.returncode != 0:
+    print("Commit message validation failed:")
+    print(result.stdout)
+    if result.stderr:
+        print("Error output:")
+        print(result.stderr)
+    sys.exit(1)
+
+print("✅ Commit message is valid!")
+"""
+
+    hook_file.write_text(hook_content)
+    hook_file.chmod(0o755)
+
+    print(f"✅ Installed commit-msg hook at {hook_file}")
+    print("The hook will automatically run when you commit.")
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/test_citation_builder.py
+++ b/tests/test_citation_builder.py
@ -23,7 +23,7 @@ def test_pi():
        given_name="Author",
        orcid="0000-0000-0000-1234",
        email="test.author@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
    )


@ -115,7 +115,9 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
        pytest.skip("Test data doesn't contain any ROR identifiers")

    # Create builder with ror=True to enable ROR identifiers
-    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True)
+    builder = CitationBuilder(
+        data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True
+    )

    # Get authors
    authors, _ = builder.build_authors()
@ -129,11 +131,11 @@ def test_build_authors_with_ror(openalex_data, pi_finder):

    for author in authors:
        # Check if author has affiliation
-        if not hasattr(author, 'affiliation') or not author.affiliation:
+        if not hasattr(author, "affiliation") or not author.affiliation:
            continue

        # Check if affiliation is an Institution with a ROR ID
-        if not hasattr(author.affiliation, 'ror'):
+        if not hasattr(author.affiliation, "ror"):
            continue

        # Check if ROR ID is present and contains "ror.org"
@ -154,7 +156,7 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
        assert affiliation_field.value == institution_with_ror.ror

        # Verify the expanded_value dictionary has the expected structure
-        assert hasattr(affiliation_field, 'expanded_value')
+        assert hasattr(affiliation_field, "expanded_value")
        assert isinstance(affiliation_field.expanded_value, dict)

        # Check specific fields in the expanded_value
--- a/tests/test_doi2dataset.py
+++ b/tests/test_doi2dataset.py
@ -13,6 +13,7 @@ def test_sanitize_filename():
    result = sanitize_filename(doi)
    assert result == expected

+
 def test_split_name_with_comma():
    """Test splitting a full name that contains a comma."""
    full_name = "Doe, John"
@ -20,6 +21,7 @@ def test_split_name_with_comma():
    assert given == "John"
    assert family == "Doe"

+
 def test_split_name_without_comma():
    """Test splitting a full name that does not contain a comma."""
    full_name = "John Doe"
@ -27,11 +29,13 @@ def test_split_name_without_comma():
    assert given == "John"
    assert family == "Doe"

+
 def test_validate_email_address_valid():
    """Test that a valid email address is correctly recognized."""
    valid_email = "john.doe@iana.org"
    assert validate_email_address(valid_email) is True

+
 def test_validate_email_address_invalid():
    """Test that an invalid email address is correctly rejected."""
    invalid_email = "john.doe@invalid_domain"
--- a/tests/test_fetch_doi_mock.py
+++ b/tests/test_fetch_doi_mock.py
@ -20,6 +20,7 @@ class FakeResponse:
    """
    A fake response object to simulate an API response.
    """
+
    def __init__(self, json_data, status_code=200):
        self._json = json_data
        self.status_code = status_code
@ -30,6 +31,7 @@ class FakeResponse:
    def raise_for_status(self):
        pass

+
@pytest.fixture(autouse=True)
 def load_config_test():
    """
@ -39,6 +41,7 @@ def load_config_test():
    config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
    Config.load_config(config_path=config_path)

+
@pytest.fixture
 def fake_openalex_response():
    """
@ -50,6 +53,7 @@ def fake_openalex_response():
        data = json.load(f)
    return data

+
 def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
    """
    Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
@ -88,7 +92,7 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
    assert abstract_text is not None

    # If abstract exists in the response, it should be properly extracted
-    if 'abstract_inverted_index' in fake_openalex_response:
+    if "abstract_inverted_index" in fake_openalex_response:
        assert len(abstract_text) > 0


@ -152,7 +156,7 @@ def test_pi_finder_find_by_orcid():
        given_name="Jon",
        orcid="0000-0000-0000-0000",
        email="jon.doe@iana.org",
-        affiliation="Institute of Science, Some University"
+        affiliation="Institute of Science, Some University",
    )

    # Create PIFinder with our test PI
@ -181,8 +185,10 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
    doi = "10.1038/srep45389"

    # Mock API response
-    mocker.patch("doi2dataset.APIClient.make_request",
-                 return_value=FakeResponse(fake_openalex_response, 200))
+    mocker.patch(
+        "doi2dataset.APIClient.make_request",
+        return_value=FakeResponse(fake_openalex_response, 200),
+    )

    # Create processor with upload disabled and progress disabled
    processor = MetadataProcessor(doi=doi, upload=False, progress=False)
--- a/tests/test_license_processor.py
+++ b/tests/test_license_processor.py
@ -3,37 +3,27 @@ from doi2dataset import License, LicenseProcessor

 def test_license_processor_cc_by():
    """Test processing a CC BY license"""
-    data = {
-        "primary_location": {
-            "license": "cc-by"
-        }
-    }
+    data = {"primary_location": {"license": "cc-by"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "cc-by"
    assert license_obj.name == "CC BY 4.0"
    assert license_obj.uri == "https://creativecommons.org/licenses/by/4.0/"

+
 def test_license_processor_cc0():
    """Test processing a CC0 license"""
-    data = {
-        "primary_location": {
-            "license": "cc0"
-        }
-    }
+    data = {"primary_location": {"license": "cc0"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "cc0"
    assert license_obj.name == "CC0 1.0"
    assert license_obj.uri == "https://creativecommons.org/publicdomain/zero/1.0/"

+
 def test_license_processor_unknown_license():
    """Test processing an unknown license"""
-    data = {
-        "primary_location": {
-            "license": "unknown-license"
-        }
-    }
+    data = {"primary_location": {"license": "unknown-license"}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "unknown-license"
@ -41,17 +31,17 @@ def test_license_processor_unknown_license():
    assert license_obj.name == "unknown-license" or license_obj.name == ""
    assert hasattr(license_obj, "uri")

+
 def test_license_processor_no_license():
    """Test processing with no license information"""
-    data = {
-        "primary_location": {}
-    }
+    data = {"primary_location": {}}
    license_obj = LicenseProcessor.process_license(data)
    assert isinstance(license_obj, License)
    assert license_obj.short == "unknown"
    assert license_obj.name == ""
    assert license_obj.uri == ""

+
 def test_license_processor_no_primary_location():
    """Test processing with no primary location"""
    data = {}
--- a/tests/test_metadata_processor.py
+++ b/tests/test_metadata_processor.py
@ -33,7 +33,10 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@ -47,21 +50,23 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa

    # Verify the basic metadata fields were extracted correctly
    assert metadata is not None
-    assert 'datasetVersion' in metadata
+    assert "datasetVersion" in metadata

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for basic metadata fields in a more flexible way
-    field_names = [field.get('typeName') for field in fields]
-    assert 'title' in field_names
-    assert 'subject' in field_names
-    assert 'dsDescription' in field_names  # Description is named 'dsDescription' in the schema
+    field_names = [field.get("typeName") for field in fields]
+    assert "title" in field_names
+    assert "subject" in field_names
+    assert (
+        "dsDescription" in field_names
+    )  # Description is named 'dsDescription' in the schema


 def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
@ -73,7 +78,10 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@ -86,33 +94,35 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
    metadata = metadata_processor._build_metadata(openalex_data)

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for author and datasetContact fields
-    field_names = [field.get('typeName') for field in fields]
-    assert 'author' in field_names
-    assert 'datasetContact' in field_names
+    field_names = [field.get("typeName") for field in fields]
+    assert "author" in field_names
+    assert "datasetContact" in field_names

    # Verify these are compound fields with actual entries
    for field in fields:
-        if field.get('typeName') == 'author':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "author":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0

-        if field.get('typeName') == 'datasetContact':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
+        if field.get("typeName") == "datasetContact":
+            assert "value" in field
+            assert isinstance(field["value"], list)
            # The datasetContact might be empty in test environment
            # Just check it exists rather than asserting length


-def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, monkeypatch):
+def test_build_metadata_keywords_and_topics(
+    metadata_processor, openalex_data, monkeypatch
+):
    """Test that _build_metadata correctly extracts keywords and topics"""
    # Mock the console to avoid print errors
    metadata_processor.console = MagicMock()
@ -121,7 +131,10 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
    abstract_mock = MagicMock()
    abstract_mock.text = "This is a sample abstract"
    abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )

    # Mock the _fetch_data method to return our test data
    metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@ -134,27 +147,27 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
    metadata = metadata_processor._build_metadata(openalex_data)

    # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})

    # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]

    # Check for keyword and subject fields
-    field_names = [field.get('typeName') for field in fields]
+    field_names = [field.get("typeName") for field in fields]

    # If keywords exist, verify structure
-    if 'keyword' in field_names:
+    if "keyword" in field_names:
        for field in fields:
-            if field.get('typeName') == 'keyword':
-                assert 'value' in field
-                assert isinstance(field['value'], list)
+            if field.get("typeName") == "keyword":
+                assert "value" in field
+                assert isinstance(field["value"], list)

    # Check for subject field which should definitely exist
-    assert 'subject' in field_names
+    assert "subject" in field_names
    for field in fields:
-        if field.get('typeName') == 'subject':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "subject":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0
--- a/tests/test_person.py
+++ b/tests/test_person.py
@ -8,7 +8,7 @@ def test_person_to_dict_with_string_affiliation():
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
    )

    result = person.to_dict()
@ -29,7 +29,7 @@ def test_person_to_dict_with_institution_ror():
        given_name="John",
        orcid="0000-0001-2345-6789",
        email="john.doe@example.org",
-        affiliation=inst
+        affiliation=inst,
    )

    result = person.to_dict()
@ -48,7 +48,7 @@ def test_person_to_dict_with_institution_display_name_only():
        family_name="Smith",
        given_name="Jane",
        orcid="0000-0001-9876-5432",
-        affiliation=inst
+        affiliation=inst,
    )

    result = person.to_dict()
@ -63,11 +63,7 @@ def test_person_to_dict_with_empty_institution():
    # Create an Institution with empty values
    inst = Institution("")

-    person = Person(
-        family_name="Brown",
-        given_name="Robert",
-        affiliation=inst
-    )
+    person = Person(family_name="Brown", given_name="Robert", affiliation=inst)

    result = person.to_dict()

@ -79,9 +75,7 @@ def test_person_to_dict_with_empty_institution():
 def test_person_to_dict_with_no_affiliation():
    """Test Person.to_dict() with no affiliation."""
    person = Person(
-        family_name="Green",
-        given_name="Alice",
-        orcid="0000-0002-1111-2222"
+        family_name="Green", given_name="Alice", orcid="0000-0002-1111-2222"
    )

    result = person.to_dict()
--- a/tests/test_publication_utils.py
+++ b/tests/test_publication_utils.py
@ -14,44 +14,44 @@ def metadata_processor():
    processor.console = MagicMock()
    return processor

+
 def test_get_publication_year_with_publication_year(metadata_processor):
    """Test that _get_publication_year extracts year from publication_year field"""
    data = {"publication_year": 2020}
    year = metadata_processor._get_publication_year(data)
    assert year == 2020

+
 def test_get_publication_year_with_date(metadata_processor):
    """Test that _get_publication_year returns empty string when publication_year is missing"""
    data = {"publication_date": "2019-05-15"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""

+
 def test_get_publication_year_with_both_fields(metadata_processor):
    """Test that _get_publication_year prioritizes publication_year over date"""
-    data = {
-        "publication_year": 2020,
-        "publication_date": "2019-05-15"
-    }
+    data = {"publication_year": 2020, "publication_date": "2019-05-15"}
    year = metadata_processor._get_publication_year(data)
    assert year == 2020

+
 def test_get_publication_year_with_partial_date(metadata_processor):
    """Test that _get_publication_year returns empty string when only publication_date is present"""
    data = {"publication_date": "2018"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""

+
 def test_get_publication_year_with_missing_data(metadata_processor):
    """Test that _get_publication_year handles missing data"""
    data = {"other_field": "value"}
    year = metadata_processor._get_publication_year(data)
    assert year == ""

+
 def test_get_publication_year_with_invalid_data(metadata_processor):
    """Test that _get_publication_year returns whatever is in publication_year field"""
-    data = {
-        "publication_year": "not-a-year",
-        "publication_date": "invalid-date"
-    }
+    data = {"publication_year": "not-a-year", "publication_date": "invalid-date"}
    year = metadata_processor._get_publication_year(data)
    assert year == "not-a-year"