From 9d270ec6016816d78da03d753046a381d13e5aa5 Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Mon, 14 Jul 2025 09:39:07 +0200 Subject: [PATCH] feat: add pre-commit setup with gitlint --- .pre-commit-config.yaml | 54 +++ README.md | 94 +++++ docs/make.bat | 94 ++--- docs/source/commit-messages.rst | 229 +++++++++++ docs/source/contributing.rst | 38 +- docs/source/index.rst | 1 + doi2dataset.py | 679 +++++++++++++++++++++---------- pyproject.toml | 5 + requirements-dev.txt | 1 + scripts/lint-commit.py | 179 ++++++++ tests/test_citation_builder.py | 12 +- tests/test_doi2dataset.py | 4 + tests/test_fetch_doi_mock.py | 14 +- tests/test_license_processor.py | 26 +- tests/test_metadata_processor.py | 95 +++-- tests/test_person.py | 16 +- tests/test_publication_utils.py | 16 +- 17 files changed, 1197 insertions(+), 360 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 docs/source/commit-messages.rst create mode 100644 scripts/lint-commit.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..419ab30 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,54 @@ +# Pre-commit configuration for doi2dataset +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks + +repos: + # Built-in pre-commit hooks + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: mixed-line-ending + args: ['--fix=lf'] + + # Python code formatting and linting + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + + # Git commit message linting with gitlint + - repo: https://github.com/jorisroovers/gitlint + rev: v0.19.1 + hooks: + - id: gitlint + stages: [commit-msg] + + # Optional: Check for common security issues + - repo: https://github.com/PyCQA/bandit + rev: 1.7.10 + hooks: + - id: bandit + args: ["-c", "pyproject.toml"] + additional_dependencies: ["bandit[toml]"] + +# Configuration for specific hooks +ci: + autofix_commit_msg: | + [pre-commit.ci] auto fixes from pre-commit hooks + + for more information, see https://pre-commit.ci + autofix_prs: true + autoupdate_branch: '' + autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' + autoupdate_schedule: weekly + skip: [] + submodules: false diff --git a/README.md b/README.md index 187dbfe..c664f98 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,73 @@ Documentation is automatically built and deployed via GitLab CI/CD: - Deployed to GitLab Pages - Accessible at your project's Pages URL +## Git Commit Message Linting + +This project uses [gitlint](https://jorisroovers.github.io/gitlint/) to enforce consistent commit message formatting. Commit messages should follow the [Conventional Commits](https://www.conventionalcommits.org/) specification. 
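+
+The hooks in `.pre-commit-config.yaml` are managed with [pre-commit](https://pre-commit.com/). A typical local setup looks like this (a sketch; adjust to your environment):
+
+```bash
+pip install pre-commit
+pre-commit install --hook-type pre-commit --hook-type commit-msg
+pre-commit run --all-files  # one-off check of the whole repository
+```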
+ +### Commit Message Format + +Commit messages must follow this format: + +``` +(): + +[optional body] + +[optional footer(s)] +``` + +**Types:** + +- `feat`: A new feature +- `fix`: A bug fix +- `docs`: Documentation only changes +- `style`: Changes that do not affect the meaning of the code +- `refactor`: A code change that neither fixes a bug nor adds a feature +- `test`: Adding missing tests or correcting existing tests +- `chore`: Changes to the build process or auxiliary tools +- `ci`: Changes to CI configuration files and scripts +- `build`: Changes that affect the build system or dependencies +- `perf`: A code change that improves performance +- `revert`: Reverts a previous commit + +**Examples:** + +``` +feat(api): add support for DOI batch processing +fix(metadata): handle missing author information gracefully +docs: update installation instructions +test(citation): add tests for license processing +``` + +### Linting Commit Messages + +To lint commit messages, use the provided script: + +```bash +# Lint the last commit +python scripts/lint-commit.py + +# Lint a specific commit +python scripts/lint-commit.py --hash + +# Lint a range of commits +python scripts/lint-commit.py --range HEAD~3.. + +# Install as a git hook (optional) +python scripts/lint-commit.py --install-hook +``` + +### Git Hook Installation + +You can optionally install a git hook that automatically checks commit messages: + +```bash +python scripts/lint-commit.py --install-hook +``` + +This will create a `commit-msg` hook that runs automatically when you commit, ensuring all commit messages follow the required format. + ## Testing Tests are implemented with pytest. The test suite provides comprehensive coverage of core functionalities. To run the tests, execute: @@ -270,6 +337,33 @@ This version has been updated to make the tool more generalized and suitable for Contributions are welcome! Please fork the repository and submit a pull request with your improvements. +### Development Setup + +1. Install development dependencies: + + ```bash + pip install -r requirements-dev.txt + ``` + +2. Run tests to ensure everything works: + + ```bash + pytest + ``` + +3. Install the git commit message hook (recommended): + ```bash + python scripts/lint-commit.py --install-hook + ``` + +### Code Quality + +- Follow the existing code style and formatting +- Write tests for new functionality +- Ensure all tests pass before submitting +- Use meaningful commit messages following the conventional commits format +- Run `python scripts/lint-commit.py` to validate commit messages + ## License This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details. diff --git a/docs/make.bat b/docs/make.bat index 9725e0f..3857d4c 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,47 +1,47 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. 
- echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -if "%1" == "multiversion" goto multiversion -if "%1" == "multiversion-clean" goto multiversion-clean - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:multiversion -sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O% -goto end - -:multiversion-clean -rmdir /s /q %BUILDDIR%\html 2>nul -sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +if "%1" == "multiversion" goto multiversion +if "%1" == "multiversion-clean" goto multiversion-clean + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:multiversion +sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O% +goto end + +:multiversion-clean +rmdir /s /q %BUILDDIR%\html 2>nul +sphinx-multiversion %SOURCEDIR% %BUILDDIR%\multiversion\html %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/commit-messages.rst b/docs/source/commit-messages.rst new file mode 100644 index 0000000..b5388dd --- /dev/null +++ b/docs/source/commit-messages.rst @@ -0,0 +1,229 @@ +Git Commit Message Linting +=========================== + +This project uses `gitlint `_ to enforce consistent commit message formatting. All commit messages must follow the `Conventional Commits `_ specification to ensure clear and standardized project history. + +Why Commit Message Standards Matter +----------------------------------- + +Standardized commit messages provide several benefits: + +* **Improved readability**: Clear, consistent format makes it easier to understand changes +* **Automated changelog generation**: Tools can parse conventional commits to generate changelogs +* **Better collaboration**: Team members can quickly understand the nature of changes +* **Easier debugging**: Well-formatted commits help identify when bugs were introduced +* **Semantic versioning**: Conventional commits can trigger automated version bumps + +Commit Message Format +--------------------- + +All commit messages must follow this format: + +.. code-block:: text + + (): + + [optional body] + + [optional footer(s)] + +Components +~~~~~~~~~~ + +**Type (required)** + The type of change being made. Must be one of: + + * ``feat``: A new feature + * ``fix``: A bug fix + * ``docs``: Documentation only changes + * ``style``: Changes that do not affect the meaning of the code (white-space, formatting, etc.) 
+ * ``refactor``: A code change that neither fixes a bug nor adds a feature + * ``test``: Adding missing tests or correcting existing tests + * ``chore``: Changes to the build process or auxiliary tools and libraries + * ``ci``: Changes to CI configuration files and scripts + * ``build``: Changes that affect the build system or external dependencies + * ``perf``: A code change that improves performance + * ``revert``: Reverts a previous commit + +**Scope (optional)** + The scope of the change, enclosed in parentheses. Common scopes for this project: + + * ``api``: Changes to API functionality + * ``metadata``: Changes to metadata processing + * ``citation``: Changes to citation building + * ``config``: Changes to configuration handling + * ``tests``: Changes to test files + * ``docs``: Changes to documentation + * ``deps``: Changes to dependencies + +**Description (required)** + A short description of the change: + + * Use the imperative, present tense: "change" not "changed" nor "changes" + * Don't capitalize the first letter + * No period (.) at the end + * Maximum 50 characters + +**Body (optional)** + A longer description of the change: + + * Use the imperative, present tense + * Wrap at 72 characters + * Explain what and why vs. how + +**Footer (optional)** + One or more footers may be provided: + + * ``BREAKING CHANGE:`` description of breaking changes + * ``Closes #123``: reference to closed issues + * ``Co-authored-by: Name ``: additional authors + +Examples +-------- + +**Simple feature addition:** + +.. code-block:: text + + feat(api): add support for DOI batch processing + +**Bug fix with scope:** + +.. code-block:: text + + fix(metadata): handle missing author information gracefully + +**Documentation update:** + +.. code-block:: text + + docs: update installation instructions + +**Breaking change:** + +.. code-block:: text + + feat(api): change metadata output format + + BREAKING CHANGE: The metadata output format has changed from JSON + to YAML. Users need to update their parsing code accordingly. + +**Multi-line with body:** + +.. code-block:: text + + refactor(citation): improve author name parsing + + The author name parsing logic has been refactored to handle + more edge cases, including names with multiple middle initials + and international characters. + + Closes #45 + +Configuration +------------- + +The project uses a ``.gitlint`` configuration file that enforces: + +* Maximum title length of 50 characters +* Conventional commit format validation +* Maximum body line length of 72 characters +* Exclusion of certain words like "WIP", "TODO", "FIXME" in titles +* Automatic ignoring of merge commits and dependency updates + +Linting Tools +------------- + +Manual Linting +~~~~~~~~~~~~~~~ + +Use the provided script to lint commit messages: + +.. code-block:: bash + + # Lint the last commit + python scripts/lint-commit.py + + # Lint a specific commit by hash + python scripts/lint-commit.py --hash + + # Lint a range of commits + python scripts/lint-commit.py --range HEAD~3.. + + # Check staged commit message + python scripts/lint-commit.py --staged + +Git Hook Installation +~~~~~~~~~~~~~~~~~~~~~ + +Install an automated git hook to check commit messages: + +.. code-block:: bash + + python scripts/lint-commit.py --install-hook + +This creates a ``commit-msg`` hook that automatically validates commit messages when you commit. The commit will be rejected if the message doesn't meet the requirements. 
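+
+Conceptually, the installed hook is a thin wrapper that hands the commit message
+file to gitlint. A minimal sketch of such a ``commit-msg`` hook (the script
+generated by ``--install-hook`` may differ in detail):
+
+.. code-block:: bash
+
+   #!/bin/sh
+   # .git/hooks/commit-msg: git calls this with the message file as $1.
+   # A non-zero exit status from gitlint aborts the commit.
+   exec gitlint --msg-filename "$1"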
+ +Direct Gitlint Usage +~~~~~~~~~~~~~~~~~~~~ + +You can also use gitlint directly: + +.. code-block:: bash + + # Lint last commit + gitlint + + # Lint specific commit + gitlint --commit + + # Lint commit range + gitlint --commits HEAD~3.. + +Common Validation Errors +------------------------- + +**Title too long** + Keep titles under 50 characters. If you need more space, use the body. + +**Invalid type** + Use only the allowed types: ``feat``, ``fix``, ``docs``, ``style``, ``refactor``, ``test``, ``chore``, ``ci``, ``build``, ``perf``, ``revert``. + +**Missing colon** + Don't forget the colon after the type/scope: ``feat(api): add feature`` + +**Capitalized description** + Don't capitalize the first letter of the description: ``feat: add feature`` not ``feat: Add feature`` + +**Trailing period** + Don't add a period at the end of the title: ``feat: add feature`` not ``feat: add feature.`` + +**Body line too long** + Keep body lines under 72 characters. Break long lines appropriately. + +Troubleshooting +--------------- + +**Gitlint not found** + Install development dependencies: + + .. code-block:: bash + + pip install -r requirements-dev.txt + +**Hook not working** + Ensure the hook is executable: + + .. code-block:: bash + + chmod +x .git/hooks/commit-msg + +**Existing commits don't follow format** + The linting only applies to new commits. Existing commits can be left as-is or rebased if necessary. + +Integration with CI/CD +---------------------- + +The commit message linting can be integrated into CI/CD pipelines to ensure all commits in pull requests follow the standard format. This helps maintain consistency across all contributors. + +For more information on gitlint configuration and advanced usage, see the `official gitlint documentation `_. diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 00e9719..6c0a5db 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -115,20 +115,47 @@ Development Setup pip install -r requirements-dev.txt -4. Make your changes -5. Run tests to ensure everything works -6. Submit a pull request +4. Install the git commit message hook (recommended): + + .. code-block:: bash + + python scripts/lint-commit.py --install-hook + +5. Make your changes +6. Run tests to ensure everything works +7. Validate your commit messages follow the standards +8. Submit a pull request Code Style ---------- Please follow the existing code style and conventions used in the project. Make sure to: -- Write clear, descriptive commit messages +- Write clear, descriptive commit messages following the :doc:`commit-messages` standards - Add tests for new functionality - Update documentation as needed - Follow Python best practices +Commit Message Standards +~~~~~~~~~~~~~~~~~~~~~~~~ + +All commit messages must follow the Conventional Commits specification. See the :doc:`commit-messages` documentation for detailed information on: + +- Required message format +- Available commit types +- Examples of proper commit messages +- How to use the linting tools + +To validate your commit messages: + +.. code-block:: bash + + # Lint the last commit + python scripts/lint-commit.py + + # Install automatic validation hook + python scripts/lint-commit.py --install-hook + Submitting Changes ------------------ @@ -136,6 +163,7 @@ Submitting Changes 2. Make your changes with appropriate tests 3. Ensure all tests pass 4. Update documentation if needed -5. Submit a pull request with a clear description of your changes +5. 
Ensure all commit messages follow the conventional commits format +6. Submit a pull request with a clear description of your changes Thank you for contributing to **doi2dataset**! diff --git a/docs/source/index.rst b/docs/source/index.rst index ea9015b..608c1d4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -39,4 +39,5 @@ Key Features: usage modules contributing + commit-messages faq diff --git a/doi2dataset.py b/doi2dataset.py index 7d23409..12a3373 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -51,11 +51,13 @@ from rich.theme import Theme # Get version from setuptools_scm try: from importlib.metadata import version + __version__ = version("doi2dataset") except ImportError: # Fallback for older Python versions try: import pkg_resources + __version__ = pkg_resources.get_distribution("doi2dataset").version except Exception: __version__ = "1.0.0" # Fallback version @@ -74,34 +76,37 @@ from idutils.validators import is_doi # noqa: E402 # Icon definitions for console output ICONS = { - 'success': "✓", # Simple checkmark - 'error': "✗", # Simple X - 'warning': "!", # Simple exclamation - 'info': "ℹ", # Info symbol - 'processing': "⋯", # Three dots - 'done': "∎", # Filled square - 'file': "⨳", # Document symbol - 'folder': "⊞", # Folder symbol - 'clock': "◷", # Clock symbol - 'search': "⌕", # Search symbol - 'data': "≡", # Three lines - 'doi': "∾", # Link symbol - 'total': "∑", # Sum symbol - 'save': "⤓", # Save/download arrow - 'upload': "⤒" # Upload arrow + "success": "✓", # Simple checkmark + "error": "✗", # Simple X + "warning": "!", # Simple exclamation + "info": "ℹ", # Info symbol + "processing": "⋯", # Three dots + "done": "∎", # Filled square + "file": "⨳", # Document symbol + "folder": "⊞", # Folder symbol + "clock": "◷", # Clock symbol + "search": "⌕", # Search symbol + "data": "≡", # Three lines + "doi": "∾", # Link symbol + "total": "∑", # Sum symbol + "save": "⤓", # Save/download arrow + "upload": "⤒", # Upload arrow } # Theme configuration for Rich console output -THEME = Theme({ - "info": "cyan", - "warning": "yellow", - "error": "red bold", - "success": "green", -}) +THEME = Theme( + { + "info": "cyan", + "warning": "yellow", + "error": "red bold", + "success": "green", + } +) # Available sources for metadata abstracts SOURCES = ["openalex", "crossref", "none"] + def format_status(icon: str, message: str, style: str = "default") -> str: """ Format a status message with an icon and a given style. @@ -116,12 +121,15 @@ def format_status(icon: str, message: str, style: str = "default") -> str: """ return f"[{style}]{ICONS[icon]} {message}[/{style}]" + class FieldType(Enum): """Enum representing different Dataverse field types.""" + PRIMITIVE = "primitive" COMPOUND = "compound" VOCABULARY = "controlledVocabulary" + @dataclass class BaseMetadataField[T]: """ @@ -137,6 +145,7 @@ class BaseMetadataField[T]: value (T): The value stored in the field. type (FieldType): The type of the field, automatically set based on T. """ + name: str multiple: bool value: T @@ -172,11 +181,13 @@ class BaseMetadataField[T]: """ raise NotImplementedError("Subclasses must implement the to_dict method.") + @dataclass class PrimitiveMetadataField(BaseMetadataField[str]): """ Metadata field representing a primitive type (e.g., string) for Dataverse. 
""" + def _set_type(self) -> None: self.type = FieldType.PRIMITIVE @@ -194,7 +205,7 @@ class PrimitiveMetadataField(BaseMetadataField[str]): "typeClass": self.type.value, "multiple": self.multiple, "value": self.value, - "expandedValue": self.expanded_value + "expandedValue": self.expanded_value, } else: return { @@ -204,11 +215,13 @@ class PrimitiveMetadataField(BaseMetadataField[str]): "value": self.value, } + @dataclass class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]): """ Metadata field for controlled vocabulary values. """ + def _set_type(self) -> None: self.type = FieldType.VOCABULARY @@ -229,11 +242,14 @@ class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]): @dataclass class CompoundMetadataField( - BaseMetadataField[Sequence[Sequence[PrimitiveMetadataField | ControlledVocabularyMetadataField]]] + BaseMetadataField[ + Sequence[Sequence[PrimitiveMetadataField | ControlledVocabularyMetadataField]] + ] ): """ Metadata field representing compound types, composed of multiple subfields. """ + def _set_type(self) -> None: self.type = FieldType.COMPOUND @@ -255,9 +271,10 @@ class CompoundMetadataField( "typeName": self.name, "typeClass": self.type.value, "multiple": self.multiple, - "value": value_list + "value": value_list, } + @dataclass class Institution: """ @@ -267,6 +284,7 @@ class Institution: display_name (str): The name of the institution. ror (str): Research Organization Registry identifier (optional). """ + display_name: str ror: str = "" @@ -282,12 +300,15 @@ class Institution: expanded_value = { "scheme": "http://www.grid.ac/ontology/", "termName": self.display_name, - "@type": "https://schema.org/Organization" + "@type": "https://schema.org/Organization", } - return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value=expanded_value) + return PrimitiveMetadataField( + "authorAffiliation", False, self.ror, expanded_value=expanded_value + ) else: return PrimitiveMetadataField("authorAffiliation", False, self.display_name) + @dataclass class Person: """ @@ -300,6 +321,7 @@ class Person: email (str): Email address (optional). affiliation (Institution): Affiliation of the person (optional). """ + family_name: str given_name: str orcid: str = "" @@ -307,34 +329,34 @@ class Person: affiliation: Institution | str = "" def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]: - """ - Convert Person to a dictionary for JSON serialization. + """ + Convert Person to a dictionary for JSON serialization. - Handles affiliations properly by checking if the affiliation - is an Institution object or a string. + Handles affiliations properly by checking if the affiliation + is an Institution object or a string. - Returns: - dict: A dictionary containing the person's information including - name, contact details, and affiliation. - """ - return_dict: dict[str, str | list[str] | dict[str, str]] = { - "family_name": self.family_name, - "given_name": self.given_name, - "orcid": self.orcid, - "email": self.email - } + Returns: + dict: A dictionary containing the person's information including + name, contact details, and affiliation. 
+ """ + return_dict: dict[str, str | list[str] | dict[str, str]] = { + "family_name": self.family_name, + "given_name": self.given_name, + "orcid": self.orcid, + "email": self.email, + } - if isinstance(self.affiliation, Institution): - if self.affiliation.ror: - return_dict["affiliation"] = self.affiliation.ror - elif self.affiliation.display_name: - return_dict["affiliation"] = self.affiliation.display_name - else: - return_dict["affiliation"] = "" + if isinstance(self.affiliation, Institution): + if self.affiliation.ror: + return_dict["affiliation"] = self.affiliation.ror + elif self.affiliation.display_name: + return_dict["affiliation"] = self.affiliation.display_name else: - return_dict["affiliation"] = self.affiliation if self.affiliation else "" + return_dict["affiliation"] = "" + else: + return_dict["affiliation"] = self.affiliation if self.affiliation else "" - return return_dict + return return_dict def format_name(self) -> str: """ @@ -345,7 +367,9 @@ class Person: """ return f"{self.family_name}, {self.given_name}" - def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]: + def author_fields( + self, + ) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]: """ Build metadata fields for the author. @@ -360,19 +384,23 @@ class Person: if isinstance(self.affiliation, Institution): affiliation_field = self.affiliation.affiliation_field() else: - affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) + affiliation_field = PrimitiveMetadataField( + "authorAffiliation", False, self.affiliation + ) if self.orcid: return [ PrimitiveMetadataField("authorName", False, self.format_name()), affiliation_field, - ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"), - PrimitiveMetadataField("authorIdentifier", False, self.orcid) + ControlledVocabularyMetadataField( + "authorIdentifierScheme", False, "ORCID" + ), + PrimitiveMetadataField("authorIdentifier", False, self.orcid), ] else: return [ PrimitiveMetadataField("authorName", False, self.format_name()), - affiliation_field + affiliation_field, ] def dataset_contact_fields(self) -> list[PrimitiveMetadataField]: @@ -391,12 +419,14 @@ class Person: if isinstance(self.affiliation, Institution): affiliation_field = self.affiliation.affiliation_field() else: - affiliation_field = PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation) + affiliation_field = PrimitiveMetadataField( + "datasetContactAffiliation", False, self.affiliation + ) return [ PrimitiveMetadataField("datasetContactName", False, self.format_name()), affiliation_field, - PrimitiveMetadataField("datasetContactEmail", False, self.email) + PrimitiveMetadataField("datasetContactEmail", False, self.email), ] @@ -410,10 +440,12 @@ class License: uri (str): The license URI. short (str): The short identifier of the license. """ + name: str uri: str short: str + @dataclass class Abstract: """ @@ -423,6 +455,7 @@ class Abstract: text (str): The abstract text. source (str): The source of the abstract ('crossref', 'openalex', or 'none'). """ + text: str source: str @@ -435,7 +468,10 @@ class Abstract: """ allowed_sources = ["crossref", "openalex", "none"] if self.source not in allowed_sources: - raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.") + raise ValueError( + f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}." 
+ ) + @dataclass class ConfigData: @@ -447,18 +483,21 @@ class ConfigData: pis (list[dict[str, Any]]): List of principal investigator configurations. default_grants (list[dict[str, str]]): Default grant configurations. """ + dataverse: dict[str, str] pis: list[dict[str, Any]] default_grants: list[dict[str, str]] + class Config: """ Singleton class to handle configuration loading and retrieval. """ - _instance: 'Config | None' = None + + _instance: "Config | None" = None _config_data: ConfigData | None = None - def __new__(cls) -> 'Config': + def __new__(cls) -> "Config": """ Create and return the singleton instance of Config. @@ -489,20 +528,22 @@ class Config: if not config_path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - with open(config_path, encoding='utf-8') as f: + with open(config_path, encoding="utf-8") as f: config_data = yaml.safe_load(f) # Validate PI email addresses - pis = config_data.get('pis', []) + pis = config_data.get("pis", []) for pi in pis: - if email := pi.get('email'): + if email := pi.get("email"): if not validate_email_address(email): - raise ValueError(f"Configuration Error: Invalid email address for PI {pi.get('given_name', '')} {pi.get('family_name', '')}: {email}") + raise ValueError( + f"Configuration Error: Invalid email address for PI {pi.get('given_name', '')} {pi.get('family_name', '')}: {email}" + ) cls._config_data = ConfigData( - dataverse=config_data.get('dataverse', {}), - pis=config_data.get('pis', []), - default_grants=config_data.get('default_grants', []) + dataverse=config_data.get("dataverse", {}), + pis=config_data.get("pis", []), + default_grants=config_data.get("default_grants", []), ) @classmethod @@ -552,6 +593,7 @@ class Config: """ return self.get_config().dataverse + class APIClient: """ Client for making HTTP requests to external APIs. @@ -559,7 +601,13 @@ class APIClient: Attributes: session (requests.Session): The underlying requests session. """ - def __init__(self, contact_mail: str | None = None, user_agent: str = f"UDE-Doi2Dataset/{__version__}", token: str | None = None) -> None: + + def __init__( + self, + contact_mail: str | None = None, + user_agent: str = f"UDE-Doi2Dataset/{__version__}", + token: str | None = None, + ) -> None: """ Initialize the API client with optional contact mail, user agent, and token. @@ -571,7 +619,9 @@ class APIClient: self.session = requests.Session() self._set_headers(contact_mail, user_agent, token) - def _set_headers(self, contact_mail: str | None, user_agent: str, token: str | None) -> None: + def _set_headers( + self, contact_mail: str | None, user_agent: str, token: str | None + ) -> None: """ Set HTTP headers for the session based on contact email and token. @@ -590,7 +640,9 @@ class APIClient: self.session.headers.update(header) - def make_request(self, url: str, method: str = "GET", **kwargs: Any) -> requests.Response | None: + def make_request( + self, url: str, method: str = "GET", **kwargs: Any + ) -> requests.Response | None: """ Make an HTTP request and return the response. @@ -610,10 +662,12 @@ class APIClient: print(f"\n{ICONS['error']} Request failed: {str(e)}") return None + class NameProcessor: """ Provides utility methods for processing names. """ + @staticmethod def normalize_string(s: str) -> str: """ @@ -625,7 +679,11 @@ class NameProcessor: Returns: str: The normalized string. 
""" - return unicodedata.normalize("NFKD", s.lower()).encode("ASCII", "ignore").decode("ASCII") + return ( + unicodedata.normalize("NFKD", s.lower()) + .encode("ASCII", "ignore") + .decode("ASCII") + ) @staticmethod def split_name(full_name: str) -> tuple[str, str]: @@ -648,10 +706,12 @@ class NameProcessor: return " ".join(parts[:-1]), parts[-1] + class PIFinder: """ Finds principal investigators (PIs) among a list of Person objects. """ + def __init__(self, pis: list[Person]) -> None: """ Initialize with a list of Person objects representing potential PIs. @@ -661,7 +721,12 @@ class PIFinder: """ self.pis = pis - def find_pi(self, family_name: str | None = None, orcid: str | None = None, given_name: str | None = None) -> Person | None: + def find_pi( + self, + family_name: str | None = None, + orcid: str | None = None, + given_name: str | None = None, + ) -> Person | None: """ Find a PI by ORCID or name. @@ -711,7 +776,10 @@ class PIFinder: normalized_family_name = NameProcessor.normalize_string(family_name) for person in self.pis: - if NameProcessor.normalize_string(person.family_name) == normalized_family_name: + if ( + NameProcessor.normalize_string(person.family_name) + == normalized_family_name + ): matches.append(person) if not matches: @@ -720,7 +788,10 @@ class PIFinder: if given_name: normalized_given_name = NameProcessor.normalize_string(given_name) for match in matches: - if NameProcessor.normalize_string(match.given_name) == normalized_given_name: + if ( + NameProcessor.normalize_string(match.given_name) + == normalized_given_name + ): return match return None @@ -729,19 +800,30 @@ class PIFinder: raise ValueError("Multiple matches found for family name") + class LicenseProcessor: """ Processes license information from metadata. """ + LICENSE_MAP = { "cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"), "cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"), "cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"), - "cc-by-nc-sa": ("https://creativecommons.org/licenses/by-nc-sa/4.0/", "CC BY-NC-SA 4.0"), - "cc-by-nc-nd": ("https://creativecommons.org/licenses/by-nc-nd/4.0/", "CC BY-NC-ND 4.0"), + "cc-by-nc-sa": ( + "https://creativecommons.org/licenses/by-nc-sa/4.0/", + "CC BY-NC-SA 4.0", + ), + "cc-by-nc-nd": ( + "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "CC BY-NC-ND 4.0", + ), "cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"), "cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"), - "pd": ("https://creativecommons.org/publicdomain/mark/1.0/", "Public Domain Mark 1.0"), + "pd": ( + "https://creativecommons.org/publicdomain/mark/1.0/", + "Public Domain Mark 1.0", + ), } @classmethod @@ -765,10 +847,12 @@ class LicenseProcessor: uri, name = cls.LICENSE_MAP.get(base_license, ("", license_short)) return License(name=name, uri=uri, short=license_short) + class AbstractProcessor: """ Retrieves and processes abstracts from CrossRef and OpenAlex. """ + def __init__(self, api_client: APIClient, console: Console | None = None): """ Initialize with an APIClient instance. @@ -780,7 +864,9 @@ class AbstractProcessor: self.api_client = api_client self.console = console or Console() - def get_abstract(self, doi: str, data: dict[str, Any], license: License) -> Abstract: + def get_abstract( + self, doi: str, data: dict[str, Any], license: License + ) -> Abstract: """ Get an abstract based on DOI and license permissions. 
@@ -795,26 +881,42 @@ class AbstractProcessor: license_ok = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"} if license.short in license_ok: - self.console.print(f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.", style="info") + self.console.print( + f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.", + style="info", + ) crossref_abstract = self._get_crossref_abstract(doi) if crossref_abstract: return Abstract(text=crossref_abstract, source="crossref") else: - self.console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning") + self.console.print( + f"\n{ICONS['warning']} No abstract found in CrossRef!", + style="warning", + ) else: if license.name: - self.console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") + self.console.print( + f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", + style="info", + ) else: - self.console.print(f"\n{ICONS['info']} Custom license does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info") - + self.console.print( + f"\n{ICONS['info']} Custom license does not allow derivative works. Reconstructing abstract from OpenAlex!", + style="info", + ) openalex_abstract = self._get_openalex_abstract(data) if openalex_abstract: return Abstract(text=openalex_abstract, source="openalex") else: - self.console.print(f"\n{ICONS['warning']} No abstract found in OpenAlex!", style="warning") + self.console.print( + f"\n{ICONS['warning']} No abstract found in OpenAlex!", style="warning" + ) - self.console.print(f"\n{ICONS['warning']} No abstract found in either CrossRef nor OpenAlex!", style="warning") + self.console.print( + f"\n{ICONS['warning']} No abstract found in either CrossRef nor OpenAlex!", + style="warning", + ) return Abstract(text="", source="none") def _get_crossref_abstract(self, doi: str) -> str | None: @@ -849,7 +951,9 @@ class AbstractProcessor: if not inv_index: return None - word_positions = [(word, pos) for word, positions in inv_index.items() for pos in positions] + word_positions = [ + (word, pos) for word, positions in inv_index.items() for pos in positions + ] sorted_words = sorted(word_positions, key=lambda x: x[1]) return " ".join(word for word, _ in sorted_words) @@ -874,13 +978,13 @@ class AbstractProcessor: # Replace closing tags that follow ordered list openings # This regex matches that comes after
         # <ol> tags
-        pattern = r'(<ol>.*?)</jats:list>'
-        text = re.sub(pattern, r'\1</ol>', text, flags=re.DOTALL)
+        pattern = r"(<ol>.*?)</jats:list>"
+        text = re.sub(pattern, r"\1</ol>", text, flags=re.DOTALL)

         # Process unordered lists second
         text = text.replace('<jats:list list-type="bullet">', "<ul>")

         # Replace remaining </jats:list> tags as unordered list closings
-        text = text.replace('</jats:list>', '</ul>')
+        text = text.replace("</jats:list>", "</ul>")
    ", "") # Handle other JATS tags replacements = { @@ -912,10 +1016,12 @@ class AbstractProcessor: text = text.replace(jats_tag, html_tag) return text + class SubjectMapper: """ Maps subject names from input data to controlled vocabulary. """ + CONTROLLED_VOCAB = { "Agricultural Sciences": "Agricultural Sciences", "Arts and Humanities": "Arts and Humanities", @@ -939,7 +1045,9 @@ class SubjectMapper: } @classmethod - def get_subjects(cls, data: dict[str, Any], fallback_subject: str = "Other") -> list[str]: + def get_subjects( + cls, data: dict[str, Any], fallback_subject: str = "Other" + ) -> list[str]: """ Extract and map subjects from input data. @@ -961,7 +1069,6 @@ class SubjectMapper: mapped_subjects = cls.map_subjects(subject_collection) return mapped_subjects if mapped_subjects else [fallback_subject] - @classmethod def map_subjects(cls, subjects: list[str]) -> list[str]: """ @@ -979,11 +1086,15 @@ class SubjectMapper: valid_subjects.add(mapped_subject) return list(valid_subjects) + class CitationBuilder: """ Builds various citation-related metadata fields. """ - def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None: + + def __init__( + self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False + ) -> None: """ Initialize the CitationBuilder with data, DOI, and a PIFinder. @@ -1004,18 +1115,22 @@ class CitationBuilder: Returns: list[list[PrimitiveMetadataField]]: Nested list of identifier metadata fields. """ - other_ids = [[ - PrimitiveMetadataField("otherIdAgency", False, "doi"), - PrimitiveMetadataField("otherIdValue", False, self.doi) - ]] + other_ids = [ + [ + PrimitiveMetadataField("otherIdAgency", False, "doi"), + PrimitiveMetadataField("otherIdValue", False, self.doi), + ] + ] if pmid := self.data.get("ids", {}).get("pmid"): try: normalized_pmid = normalize_pmid(pmid) - other_ids.append([ - PrimitiveMetadataField("otherIdAgency", False, "pmid"), - PrimitiveMetadataField("otherIdValue", False, normalized_pmid) - ]) + other_ids.append( + [ + PrimitiveMetadataField("otherIdAgency", False, "pmid"), + PrimitiveMetadataField("otherIdValue", False, normalized_pmid), + ] + ) except ValueError: pass @@ -1034,7 +1149,12 @@ class CitationBuilder: grants: list[list[PrimitiveMetadataField]] = [] for grant in default_grants: - grants.append([PrimitiveMetadataField("grantNumberAgency", False, grant["funder"]), PrimitiveMetadataField("grantNumberValue", False, grant["id"])]) + grants.append( + [ + PrimitiveMetadataField("grantNumberAgency", False, grant["funder"]), + PrimitiveMetadataField("grantNumberValue", False, grant["id"]), + ] + ) for grant in self.data.get("grants", []): grant_funder = grant.get("funder_display_name", {}) @@ -1042,11 +1162,15 @@ class CitationBuilder: if not grant_funder or not grant_id: continue - grants.append([PrimitiveMetadataField("grantNumberAgency", False, grant_funder), PrimitiveMetadataField("grantNumberValue", False, grant_id)]) + grants.append( + [ + PrimitiveMetadataField("grantNumberAgency", False, grant_funder), + PrimitiveMetadataField("grantNumberValue", False, grant_id), + ] + ) return grants - def build_authors(self) -> tuple[list[Person], list[Person]]: """ Build lists of authors and corresponding authors from the metadata. 
@@ -1065,14 +1189,17 @@ class CitationBuilder: authors.append(author_person) if authorship.get("is_corresponding"): - corresponding_entry = self._process_corresponding_author(author_person, authorship) + corresponding_entry = self._process_corresponding_author( + author_person, authorship + ) if corresponding_entry: corresponding_authors.append(corresponding_entry) return authors, corresponding_authors - - def _process_author(self, author: dict[str, Any], authorship: dict[str, Any]) -> Person: + def _process_author( + self, author: dict[str, Any], authorship: dict[str, Any] + ) -> Person: """ Process author data and return a Person instance. @@ -1089,7 +1216,9 @@ class CitationBuilder: person = Person(family_name, given_name) if affiliations := authorship.get("affiliations"): - affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip()) + affiliation = Institution( + affiliations[0].get("raw_affiliation_string", "").strip() + ) person.affiliation = affiliation @@ -1097,18 +1226,20 @@ class CitationBuilder: if institutions := authorship.get("institutions"): institution = institutions[0] if institution.get("ror"): - affiliation = Institution(institution.get("display_name"), institution.get("ror")) + affiliation = Institution( + institution.get("display_name"), institution.get("ror") + ) person.affiliation = affiliation - if orcid := author.get("orcid"): person.orcid = normalize_orcid(orcid) return person - - def _process_corresponding_author(self, author: Person, authorship: dict[str, Any]) -> Person | None: + def _process_corresponding_author( + self, author: Person, authorship: dict[str, Any] + ) -> Person | None: """ Identify the corresponding author based on provided PI information. @@ -1122,7 +1253,7 @@ class CitationBuilder: pi = self.pi_finder.find_pi( family_name=author.family_name, given_name=author.given_name, - orcid=author.orcid + orcid=author.orcid, ) if not pi: @@ -1141,19 +1272,26 @@ class CitationBuilder: for topic in self.data.get("topics", []): if topic.get("score") >= 0.8: - - topic_class_value_field = PrimitiveMetadataField("topicClassValue", - False, topic.get("display_name")) - topic_class_vocab_field = PrimitiveMetadataField("topicClassVocab", - False, "OpenAlex") + topic_class_value_field = PrimitiveMetadataField( + "topicClassValue", False, topic.get("display_name") + ) + topic_class_vocab_field = PrimitiveMetadataField( + "topicClassVocab", False, "OpenAlex" + ) topic_class_vocab_uri_field = PrimitiveMetadataField( - "topicClassVocabURI", False, topic.get("id")) + "topicClassVocabURI", False, topic.get("id") + ) - topics.append([topic_class_value_field, topic_class_vocab_field, topic_class_vocab_uri_field]) + topics.append( + [ + topic_class_value_field, + topic_class_vocab_field, + topic_class_vocab_uri_field, + ] + ) return topics - def build_keywords(self) -> list[list[PrimitiveMetadataField]]: """ Build metadata fields for keywords from both regular keywords and MeSH terms. 
@@ -1166,7 +1304,9 @@ class CitationBuilder: for keyword in self.data.get("keywords", []): # Filter out possibly unrelated keywords (low score) if keyword["score"] >= 0.5: - keyword_value_field = PrimitiveMetadataField("keywordValue", False, keyword["display_name"]) + keyword_value_field = PrimitiveMetadataField( + "keywordValue", False, keyword["display_name"] + ) keywords.append([keyword_value_field]) mesh_base_url = "http://id.nlm.nih.gov/mesh" @@ -1175,21 +1315,37 @@ class CitationBuilder: if mesh["qualifier_ui"]: url = f"{url}{mesh['qualifier_ui']}" + keyword_value_field = PrimitiveMetadataField( + "keywordValue", False, mesh["descriptor_name"] + ) + keyword_term_uri_field = PrimitiveMetadataField( + "keywordTermURI", False, url + ) + keyword_vocabulary_field = PrimitiveMetadataField( + "keywordVocabulary", False, "MeSH" + ) + keyword_vocabulary_uri_field = PrimitiveMetadataField( + "keywordVocabularyURI", False, mesh_base_url + ) - keyword_value_field = PrimitiveMetadataField("keywordValue", False, mesh["descriptor_name"]) - keyword_term_uri_field = PrimitiveMetadataField("keywordTermURI", False, url) - keyword_vocabulary_field = PrimitiveMetadataField("keywordVocabulary", False, "MeSH") - keyword_vocabulary_uri_field = PrimitiveMetadataField("keywordVocabularyURI", False, mesh_base_url) - - keywords.append([keyword_value_field, keyword_term_uri_field, keyword_vocabulary_field, keyword_vocabulary_uri_field]) + keywords.append( + [ + keyword_value_field, + keyword_term_uri_field, + keyword_vocabulary_field, + keyword_vocabulary_uri_field, + ] + ) return keywords + class MetadataProcessor: """ Processes metadata for a given DOI by fetching data from OpenAlex, building metadata blocks, and optionally uploading the dataset. """ + def __init__( self, doi: str, @@ -1198,10 +1354,10 @@ class MetadataProcessor: default_subject: str = "Other", contact_mail: str | None = None, upload: bool = False, - ror: bool= False, + ror: bool = False, console: Console | None = None, progress: Progress | None = None, - task_id: TaskID | None = None + task_id: TaskID | None = None, ) -> None: """ Initialize the MetadataProcessor with configuration and processing options. @@ -1259,7 +1415,7 @@ class MetadataProcessor: Advance the progress bar if enabled. """ if self.progress and self.task_id is not None: - self.progress.advance(self.task_id) + self.progress.advance(self.task_id) def process(self) -> dict[str, Any]: """ @@ -1268,7 +1424,9 @@ class MetadataProcessor: Returns: dict[str, Any]: The constructed metadata dictionary. 
""" - self.console.print(f"{ICONS['processing']} Processing DOI: {self.doi}", style="info") + self.console.print( + f"{ICONS['processing']} Processing DOI: {self.doi}", style="info" + ) data = self._fetch_data() self._update_progress() @@ -1283,7 +1441,10 @@ class MetadataProcessor: self._save_output(metadata) self._update_progress() - self.console.print(f"\n{ICONS['success']} Successfully processed: {self.doi}\n", style="success") + self.console.print( + f"\n{ICONS['success']} Successfully processed: {self.doi}\n", + style="success", + ) return metadata def _upload_data(self, metadata: dict[str, Any]) -> dict[str, Any]: @@ -1301,19 +1462,25 @@ class MetadataProcessor: """ config = Config() - token = config.DATAVERSE['api_token'] + token = config.DATAVERSE["api_token"] client = APIClient(token=token) url = f"{config.DATAVERSE['url']}/api/dataverses/{config.DATAVERSE['dataverse']}/datasets?doNotValidate=true" - auth = (config.DATAVERSE['auth_user'], config.DATAVERSE['auth_password']) + auth = (config.DATAVERSE["auth_user"], config.DATAVERSE["auth_password"]) response = client.make_request(url, method="POST", auth=auth, json=metadata) if response is None or response.status_code != 201: - self.console.print(f"\n{ICONS['error']} Failed to upload to Dataverse: {url}", style="error") + self.console.print( + f"\n{ICONS['error']} Failed to upload to Dataverse: {url}", + style="error", + ) raise ValueError(f"Failed to upload to Dataverse: {url}") else: perma = response.json().get("data", {}).get("persistentId", "") - self.console.print(f"{ICONS['upload']} Dataset uploaded to: {config.DATAVERSE['dataverse']} with ID {perma}", style="info") + self.console.print( + f"{ICONS['upload']} Dataset uploaded to: {config.DATAVERSE['dataverse']} with ID {perma}", + style="info", + ) return response.json() @@ -1331,7 +1498,10 @@ class MetadataProcessor: response = self.api_client.make_request(url) if response is None or response.status_code != 200: - self.console.print(f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}", style="error") + self.console.print( + f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}", + style="error", + ) raise ValueError(f"Failed to fetch data for DOI: {self.doi}") return response.json() @@ -1353,20 +1523,27 @@ class MetadataProcessor: authors, corresponding_authors = citation_builder.build_authors() - author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = [] + author_fields: list[ + list[PrimitiveMetadataField | ControlledVocabularyMetadataField] + ] = [] corresponding_author_fields: list[list[PrimitiveMetadataField]] = [] for author in authors: author_fields.append(author.author_fields()) if not corresponding_authors: - self.console.print(f"{ICONS['warning']} No corresponding authors explicitly declared; PIs are used as a fallback!", style="warning") + self.console.print( + f"{ICONS['warning']} No corresponding authors explicitly declared; PIs are used as a fallback!", + style="warning", + ) pis = self._get_involved_pis(data) corresponding_authors: list[Person] for pi in pis: corresponding_authors.append(pi) for corresponding_author in corresponding_authors: - corresponding_author_fields.append(corresponding_author.dataset_contact_fields()) + corresponding_author_fields.append( + corresponding_author.dataset_contact_fields() + ) description = self._build_description(data, abstract) @@ -1377,33 +1554,78 @@ class MetadataProcessor: "metadataBlocks": { "citation": { "fields": [ - PrimitiveMetadataField("title", False, 
data.get("title", "")).to_dict(), - PrimitiveMetadataField("distributionDate", False, data.get("publication_date", "")).to_dict(), - CompoundMetadataField("otherId", True, citation_builder.build_other_ids()).to_dict(), - CompoundMetadataField("dsDescription", True, [[PrimitiveMetadataField("dsDescriptionValue", False, description)]]).to_dict(), - ControlledVocabularyMetadataField("subject", True, SubjectMapper.get_subjects(data, self.default_subject)).to_dict(), - CompoundMetadataField("topicClassification", True, citation_builder.build_topics()).to_dict(), - CompoundMetadataField("keyword", True, citation_builder.build_keywords()).to_dict(), - PrimitiveMetadataField("depositor", False, self.depositor or data["primary_location"]["source"].get("display_name", "")).to_dict(), - PrimitiveMetadataField("alternativeURL", False, f"https://doi.org/{self.doi}").to_dict(), - CompoundMetadataField("author", True, author_fields).to_dict(), - CompoundMetadataField("datasetContact", True, corresponding_author_fields).to_dict(), - CompoundMetadataField("grantNumber", True, grants).to_dict() + PrimitiveMetadataField( + "title", False, data.get("title", "") + ).to_dict(), + PrimitiveMetadataField( + "distributionDate", + False, + data.get("publication_date", ""), + ).to_dict(), + CompoundMetadataField( + "otherId", True, citation_builder.build_other_ids() + ).to_dict(), + CompoundMetadataField( + "dsDescription", + True, + [ + [ + PrimitiveMetadataField( + "dsDescriptionValue", False, description + ) + ] + ], + ).to_dict(), + ControlledVocabularyMetadataField( + "subject", + True, + SubjectMapper.get_subjects(data, self.default_subject), + ).to_dict(), + CompoundMetadataField( + "topicClassification", + True, + citation_builder.build_topics(), + ).to_dict(), + CompoundMetadataField( + "keyword", True, citation_builder.build_keywords() + ).to_dict(), + PrimitiveMetadataField( + "depositor", + False, + self.depositor + or data["primary_location"]["source"].get( + "display_name", "" + ), + ).to_dict(), + PrimitiveMetadataField( + "alternativeURL", False, f"https://doi.org/{self.doi}" + ).to_dict(), + CompoundMetadataField( + "author", True, author_fields + ).to_dict(), + CompoundMetadataField( + "datasetContact", True, corresponding_author_fields + ).to_dict(), + CompoundMetadataField( + "grantNumber", True, grants + ).to_dict(), ], - "displayName": "Citation Metadata" + "displayName": "Citation Metadata", } }, - "files": [] + "files": [], } } if license_info.name: return_dict["datasetVersion"]["license"] = { "name": license_info.name, - "uri": license_info.uri + "uri": license_info.uri, } else: - return_dict["datasetVersion"]["termsOfUse"] = f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]" + return_dict["datasetVersion"]["termsOfUse"] = ( + f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]" + ) return return_dict @@ -1442,7 +1664,10 @@ class MetadataProcessor: elif all([journal, publication_date, type]): return f"

<p>This {type} was published on {publication_date} in {journal}</p>
    " - self.console.print(f"{ICONS['warning']} No abstract header added, missing information (journal, publication date and/or document type)", style="warning") + self.console.print( + f"{ICONS['warning']} No abstract header added, missing information (journal, publication date and/or document type)", + style="warning", + ) return "" def _get_publication_year(self, data: dict[str, Any]) -> str: @@ -1457,8 +1682,6 @@ class MetadataProcessor: """ return data.get("publication_year", "") - - def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]: """ Identify involved principal investigators from the metadata for use as fallback @@ -1486,7 +1709,7 @@ class MetadataProcessor: if pi := self.pi_finder.find_pi( family_name=family_name, given_name=given_name, - orcid=author.get("orcid") + orcid=author.get("orcid"), ): involved_pis.append(pi) @@ -1516,20 +1739,29 @@ class MetadataProcessor: Returns: A JSON-serializable representation of the object. """ + def default(self, o: Any) -> Any: - if hasattr(o, 'to_dict'): + if hasattr(o, "to_dict"): return o.to_dict() return super().default(o) with open(self.output_path, "w", encoding="utf-8") as f: - json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder) - self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info") + json.dump( + metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder + ) + self.console.print( + f"{ICONS['save']} Metadata saved in: {self.output_path}", + style="info", + ) except Exception as e: - self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error") + self.console.print( + f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error" + ) raise else: self.console.print(metadata) + def sanitize_filename(doi: str) -> str: """ Convert DOI to a valid filename using only alphanumeric characters and underscores. @@ -1541,12 +1773,13 @@ def sanitize_filename(doi: str) -> str: str: Sanitized filename string. """ # Replace non-alphanumeric characters with underscores - sanitized = ''.join(c if c.isalnum() else '_' for c in doi) + sanitized = "".join(c if c.isalnum() else "_" for c in doi) # Remove consecutive underscores - while '__' in sanitized: - sanitized = sanitized.replace('__', '_') + while "__" in sanitized: + sanitized = sanitized.replace("__", "_") # Remove leading/trailing underscores - return sanitized.strip('_') + return sanitized.strip("_") + def print_summary(results: dict[str, list[Any]], console: Console) -> None: """ @@ -1565,19 +1798,21 @@ def print_summary(results: dict[str, list[Any]], console: Console) -> None: table.add_row( f"{ICONS['success']} Success", str(len(results["success"])), - ", ".join(results["success"][:3]) + ("..." if len(results["success"]) > 3 else "") + ", ".join(results["success"][:3]) + + ("..." if len(results["success"]) > 3 else ""), ) if results["failed"]: table.add_row( f"{ICONS['error']} Failed", str(len(results["failed"])), - ", ".join(doi for doi, _ in results["failed"][:3]) + - ("..." if len(results["failed"]) > 3 else "") + ", ".join(doi for doi, _ in results["failed"][:3]) + + ("..." if len(results["failed"]) > 3 else ""), ) console.print(Panel(table, title="Summary", border_style="blue")) + def validate_email_address(email: str): """ Validate an email address and ensure its domain has an MX record. 
@@ -1594,13 +1829,14 @@ def validate_email_address(email: str): email = valid.normalized # Check domain has MX record - domain = email.split('@')[1] - dns.resolver.resolve(domain, 'MX') + domain = email.split("@")[1] + dns.resolver.resolve(domain, "MX") return True except (EmailNotValidError, dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): return False + def process_doi_batch( dois: set[str], output_dir: Path, @@ -1609,7 +1845,7 @@ def process_doi_batch( contact_mail: str | None = None, upload: bool = False, ror: bool = False, - console: Console | None = None + console: Console | None = None, ) -> dict[str, list[Any]]: """ Process a batch of DOIs and return a summary of results. @@ -1648,30 +1884,25 @@ def process_doi_batch( if upload: doi_total_steps = 4 # Fetch, Build, Upload, Save else: - doi_total_steps = 3 # Fetch, Build, Save + doi_total_steps = 3 # Fetch, Build, Save with Progress( *progress_columns, console=console, - transient=True # This makes the progress bar disappear after completion + transient=True, # This makes the progress bar disappear after completion ) as progress: # Add main task - main_task = progress.add_task( - "[bold blue]Processing DOIs...", - total=len(dois) - ) + main_task = progress.add_task("[bold blue]Processing DOIs...", total=len(dois)) # Add status task for current DOI status_task = progress.add_task( "[cyan]Current:", total=None, # Indeterminate progress - visible=False # Hidden initially + visible=False, # Hidden initially ) status_task = progress.add_task( - "[cyan]Current:", - total=doi_total_steps, - visible=False + "[cyan]Current:", total=doi_total_steps, visible=False ) for doi in dois: @@ -1681,7 +1912,7 @@ def process_doi_batch( status_task, description=f"[cyan]Current: [white]{doi[:50]}...", visible=True, - completed=0 # Reset progress for new DOI + completed=0, # Reset progress for new DOI ) # Process the DOI @@ -1698,7 +1929,7 @@ def process_doi_batch( ror=ror, console=console, progress=progress, - task_id=status_task + task_id=status_task, ) # Process and capture result @@ -1714,8 +1945,7 @@ def process_doi_batch( # Show error but keep progress bar progress.console.print( - f"{ICONS['error']} Error processing {doi}: {str(e)}", - style="error" + f"{ICONS['error']} Error processing {doi}: {str(e)}", style="error" ) finally: # Clear current status @@ -1732,76 +1962,81 @@ def main(): console = Console(theme=THEME) try: - - parser = argparse.ArgumentParser(description="Process DOIs to generate metadata") - parser.add_argument( - "dois", - nargs="*", - help="One or more DOIs to process" + parser = argparse.ArgumentParser( + description="Process DOIs to generate metadata" ) + parser.add_argument("dois", nargs="*", help="One or more DOIs to process") parser.add_argument( - "-f", "--file", + "-f", + "--file", help="File containing DOIs (one per line)", - type=argparse.FileType('r') + type=argparse.FileType("r"), ) parser.add_argument( - "-o", "--output-dir", + "-o", + "--output-dir", help="Output directory for metadata files", - default="." 
+        default=".",
         )
         parser.add_argument(
-            "-d", "--depositor",
-            help="Name of the depositor",
-            default=None
+            "-d", "--depositor", help="Name of the depositor", default=None
         )
         parser.add_argument(
-            "-s", "--subject",
+            "-s",
+            "--subject",
             help="Default subject",
-            default="Medicine, Health and Life Sciences"
+            default="Medicine, Health and Life Sciences",
         )
         parser.add_argument(
-            "-m", "--contact-mail",
-            help="Contact email address",
-            default=False
+            "-m", "--contact-mail", help="Contact email address", default=None
         )
         parser.add_argument(
-            "-u", "--upload",
-            help="Upload to Dataverse",
-            action="store_true"
+            "-u", "--upload", help="Upload to Dataverse", action="store_true"
         )
         parser.add_argument(
-            "-r", "--use-ror",
-            help="Use ROR ID if available",
-            action="store_true"
+            "-r", "--use-ror", help="Use ROR ID if available", action="store_true"
         )
 
         args = parser.parse_args()
 
         # Ensure we have either DOIs as arguments or a file
         if not args.dois and not args.file:
-            console.print(f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.", style="error")
+            console.print(
+                f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.",
+                style="error",
+            )
             parser.print_help()
             sys.exit(1)
 
         # Get DOIs from both direct arguments and file if provided
         dois = set(args.dois)  # Start with directly provided DOIs
         if args.file:
-            console.print(f"{ICONS['file']} Reading DOIs from file: {args.file.name}", style="info")
+            console.print(
+                f"{ICONS['file']} Reading DOIs from file: {args.file.name}",
+                style="info",
+            )
             dois.update(line.strip() for line in args.file if line.strip())
 
         # Create output directory if it doesn't exist
         output_dir = Path(args.output_dir)
         try:
             output_dir.mkdir(parents=True, exist_ok=True)
-            console.print(f"{ICONS['folder']} Output directory: {output_dir}\n", style="info")
+            console.print(
+                f"{ICONS['folder']} Output directory: {output_dir}\n", style="info"
+            )
         except Exception as e:
-            console.print(f"Failed to create output directory: {str(e)}\n", style="error")
+            console.print(
+                f"Failed to create output directory: {str(e)}\n", style="error"
+            )
             sys.exit(1)
 
         if args.contact_mail:
             if not validate_email_address(args.contact_mail):
                 raise ValueError(f"Not a valid email address: {args.contact_mail}")
-            console.print(f"{ICONS['info']} Exposing contact email <{args.contact_mail}> to API services.\n", style="info")
+            console.print(
+                f"{ICONS['info']} Exposing contact email <{args.contact_mail}> to API services.\n",
+                style="info",
+            )
 
         # Process DOIs and track time
         process_doi_batch(
@@ -1812,16 +2047,18 @@ def main():
             contact_mail=args.contact_mail,
             upload=args.upload,
             ror=args.use_ror,
-            console=console
+            console=console,
         )
-
-
     except KeyboardInterrupt:
-        console.print(f"\n{ICONS['warning']} Processing interrupted by user", style="warning")
+        console.print(
+            f"\n{ICONS['warning']} Processing interrupted by user", style="warning"
+        )
         sys.exit(1)
     except Exception as e:
-        console.print(f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error")
+        console.print(
+            f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error"
+        )
         sys.exit(1)
 
diff --git a/pyproject.toml b/pyproject.toml
index bd1a783..84bd858 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ dev = [
     "pytest-mock>=3.14.0,<4.0",
     "pytest-cov>=6.0.0,<7.0",
     "ruff>=0.11.1,<0.20",
+    "gitlint>=0.19.1,<0.20",
 ]
 test = [
     "pytest>=8.3.5,<9.0",
@@ -132,3 +133,7 @@ ignore = [
 
 [tool.ruff.lint.per-file-ignores]
"tests/*" = ["E501"] + +[tool.bandit] +exclude_dirs = ["tests", "docs", ".venv", "build", "dist"] +skips = ["B101", "B601", "B404", "B603"] diff --git a/requirements-dev.txt b/requirements-dev.txt index ab30c10..eb0ffaa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,3 +2,4 @@ pytest>=8.3.5,<9.0 pytest-mock>=3.14.0,<4.0 pytest-cov>=6.0.0,<7.0 ruff>=0.11.1,<0.20 +gitlint>=0.19.1,<0.20 diff --git a/scripts/lint-commit.py b/scripts/lint-commit.py new file mode 100644 index 0000000..a424e43 --- /dev/null +++ b/scripts/lint-commit.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +Simple script to lint git commit messages using gitlint. + +This script can be used to: +1. Lint the last commit message +2. Lint a specific commit by hash +3. Lint commit messages in a range +4. Be used as a pre-commit hook + +Usage: + python scripts/lint-commit.py # Lint last commit + python scripts/lint-commit.py --hash # Lint specific commit + python scripts/lint-commit.py --range # Lint commit range + python scripts/lint-commit.py --staged # Lint staged commit message + +This implementation enforces conventional commit message format. +""" + +import argparse +import subprocess +import sys +from pathlib import Path + + +def run_command(cmd, check=True): + """Run a shell command and return the result.""" + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=check) + return result + except subprocess.CalledProcessError as e: + print(f"Error running command: {cmd}") + print(f"Exit code: {e.returncode}") + print(f"Output: {e.stdout}") + print(f"Error: {e.stderr}") + return e + + +def check_gitlint_installed(): + """Check if gitlint is installed.""" + result = run_command(["which", "gitlint"], check=False) + if result.returncode != 0: + print("Error: gitlint is not installed.") + print("Please install it with: pip install gitlint") + print("Or install dev dependencies: pip install -r requirements-dev.txt") + sys.exit(1) + + +def lint_commit(commit_hash=None, commit_range=None, staged=False): + """Lint commit message(s) using gitlint.""" + # Build gitlint command + cmd = ["gitlint"] + + if staged: + # Lint staged commit message + cmd.extend(["--staged"]) + elif commit_range: + # Lint commit range + cmd.extend(["--commits", commit_range]) + elif commit_hash: + # Lint specific commit + cmd.extend(["--commit", commit_hash]) + else: + # Lint last commit (default) + cmd.extend(["--commit", "HEAD"]) + + print(f"Running: {' '.join(cmd)}") + print("-" * 50) + + # Run gitlint + result = run_command(cmd, check=False) + + if result.returncode == 0: + print("✅ All commit messages are valid!") + return True + else: + print("❌ Commit message validation failed:") + print(result.stdout) + if result.stderr: + print("Error output:") + print(result.stderr) + return False + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Lint git commit messages using gitlint", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Lint last commit + %(prog)s --hash abc123 # Lint specific commit + %(prog)s --range HEAD~3.. 
+  %(prog)s --staged           # Lint staged commit message
+        """,
+    )
+
+    parser.add_argument("--hash", help="Specific commit hash to lint")
+
+    parser.add_argument("--range", help="Commit range to lint (e.g., HEAD~3..)")
+
+    parser.add_argument(
+        "--staged", action="store_true", help="Lint staged commit message"
+    )
+
+    parser.add_argument(
+        "--install-hook", action="store_true", help="Install as git commit-msg hook"
+    )
+
+    args = parser.parse_args()
+
+    # Check if gitlint is installed
+    check_gitlint_installed()
+
+    # Install hook if requested
+    if args.install_hook:
+        install_hook()
+        return
+
+    # Validate arguments
+    exclusive_args = [args.hash, args.range, args.staged]
+    if sum(bool(arg) for arg in exclusive_args) > 1:
+        print("Error: --hash, --range, and --staged are mutually exclusive")
+        sys.exit(1)
+
+    # Lint commits
+    success = lint_commit(
+        commit_hash=args.hash, commit_range=args.range, staged=args.staged
+    )
+
+    sys.exit(0 if success else 1)
+
+
+def install_hook():
+    """Install the script as a git commit-msg hook."""
+    git_dir = Path(".git")
+    if not git_dir.exists():
+        print("Error: Not in a git repository")
+        sys.exit(1)
+
+    hooks_dir = git_dir / "hooks"
+    hooks_dir.mkdir(exist_ok=True)
+
+    hook_file = hooks_dir / "commit-msg"
+
+    hook_content = """#!/usr/bin/env python3
+# Git commit-msg hook for gitlint
+# Python-based commit message linting with gitlint
+import subprocess
+import sys
+
+# Run gitlint on the commit message
+result = subprocess.run(  # nosec B603
+    ["gitlint", "--msg-filename", sys.argv[1]],
+    capture_output=True,
+    text=True
+)
+
+if result.returncode != 0:
+    print("Commit message validation failed:")
+    print(result.stdout)
+    if result.stderr:
+        print("Error output:")
+        print(result.stderr)
+    sys.exit(1)
+
+print("✅ Commit message is valid!")
+"""
+
+    hook_file.write_text(hook_content)
+    hook_file.chmod(0o755)
+
+    print(f"✅ Installed commit-msg hook at {hook_file}")
+    print("The hook will automatically run when you commit.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_citation_builder.py b/tests/test_citation_builder.py
index 08045fd..b664bf7 100644
--- a/tests/test_citation_builder.py
+++ b/tests/test_citation_builder.py
@@ -23,7 +23,7 @@ def test_pi():
         given_name="Author",
         orcid="0000-0000-0000-1234",
         email="test.author@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
     )
 
 
@@ -115,7 +115,9 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
         pytest.skip("Test data doesn't contain any ROR identifiers")
 
     # Create builder with ror=True to enable ROR identifiers
-    builder = CitationBuilder(data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True)
+    builder = CitationBuilder(
+        data=openalex_data, doi=doi, pi_finder=pi_finder, ror=True
+    )
 
     # Get authors
     authors, _ = builder.build_authors()
@@ -129,11 +131,11 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
 
     for author in authors:
         # Check if author has affiliation
-        if not hasattr(author, 'affiliation') or not author.affiliation:
+        if not hasattr(author, "affiliation") or not author.affiliation:
             continue
 
         # Check if affiliation is an Institution with a ROR ID
-        if not hasattr(author.affiliation, 'ror'):
+        if not hasattr(author.affiliation, "ror"):
             continue
 
         # Check if ROR ID is present and contains "ror.org"
@@ -154,7 +156,7 @@ def test_build_authors_with_ror(openalex_data, pi_finder):
         assert affiliation_field.value == institution_with_ror.ror
 
         # Verify the expanded_value dictionary has the expected structure
-        assert hasattr(affiliation_field, 'expanded_value')
+        assert hasattr(affiliation_field, "expanded_value")
         assert isinstance(affiliation_field.expanded_value, dict)
 
         # Check specific fields in the expanded_value
diff --git a/tests/test_doi2dataset.py b/tests/test_doi2dataset.py
index e5515d8..4f4ec15 100644
--- a/tests/test_doi2dataset.py
+++ b/tests/test_doi2dataset.py
@@ -13,6 +13,7 @@ def test_sanitize_filename():
     result = sanitize_filename(doi)
     assert result == expected
 
+
 def test_split_name_with_comma():
     """Test splitting a full name that contains a comma."""
     full_name = "Doe, John"
@@ -20,6 +21,7 @@ def test_split_name_with_comma():
     assert given == "John"
     assert family == "Doe"
 
+
 def test_split_name_without_comma():
     """Test splitting a full name that does not contain a comma."""
     full_name = "John Doe"
@@ -27,11 +29,13 @@ def test_split_name_without_comma():
     assert given == "John"
     assert family == "Doe"
 
+
 def test_validate_email_address_valid():
     """Test that a valid email address is correctly recognized."""
     valid_email = "john.doe@iana.org"
     assert validate_email_address(valid_email) is True
 
+
 def test_validate_email_address_invalid():
     """Test that an invalid email address is correctly rejected."""
     invalid_email = "john.doe@invalid_domain"
diff --git a/tests/test_fetch_doi_mock.py b/tests/test_fetch_doi_mock.py
index ecdd7b5..3ed99b0 100644
--- a/tests/test_fetch_doi_mock.py
+++ b/tests/test_fetch_doi_mock.py
@@ -20,6 +20,7 @@ class FakeResponse:
     """
     A fake response object to simulate an API response.
     """
+
     def __init__(self, json_data, status_code=200):
         self._json = json_data
         self.status_code = status_code
@@ -30,6 +31,7 @@ class FakeResponse:
     def raise_for_status(self):
         pass
 
+
 @pytest.fixture(autouse=True)
 def load_config_test():
     """
@@ -39,6 +41,7 @@ def load_config_test():
     config_path = os.path.join(os.path.dirname(__file__), "config_test.yaml")
     Config.load_config(config_path=config_path)
 
+
 @pytest.fixture
 def fake_openalex_response():
     """
@@ -50,6 +53,7 @@ def fake_openalex_response():
         data = json.load(f)
     return data
 
+
 def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
     """
     Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
@@ -88,7 +92,7 @@ def test_openalex_abstract_extraction(mocker, fake_openalex_response):
     assert abstract_text is not None
 
     # If abstract exists in the response, it should be properly extracted
-    if 'abstract_inverted_index' in fake_openalex_response:
+    if "abstract_inverted_index" in fake_openalex_response:
         assert len(abstract_text) > 0
 
 
@@ -152,7 +156,7 @@ def test_pi_finder_find_by_orcid():
         given_name="Jon",
         orcid="0000-0000-0000-0000",
         email="jon.doe@iana.org",
-        affiliation="Institute of Science, Some University"
+        affiliation="Institute of Science, Some University",
     )
 
     # Create PIFinder with our test PI
@@ -181,8 +185,10 @@ def test_metadata_processor_fetch_data(mocker, fake_openalex_response):
     doi = "10.1038/srep45389"
 
     # Mock API response
-    mocker.patch("doi2dataset.APIClient.make_request",
-                 return_value=FakeResponse(fake_openalex_response, 200))
+    mocker.patch(
+        "doi2dataset.APIClient.make_request",
+        return_value=FakeResponse(fake_openalex_response, 200),
+    )
 
     # Create processor with upload disabled and progress disabled
     processor = MetadataProcessor(doi=doi, upload=False, progress=False)
diff --git a/tests/test_license_processor.py b/tests/test_license_processor.py
index 560fe5a..f9eff58 100644
--- a/tests/test_license_processor.py
+++ b/tests/test_license_processor.py
@@ -3,37 +3,27 @@ from doi2dataset import License, LicenseProcessor
 
 def test_license_processor_cc_by():
     """Test processing a CC BY license"""
-    data = {
-        "primary_location": {
-            "license": "cc-by"
-        }
-    }
+    data = {"primary_location": {"license": "cc-by"}}
     license_obj = LicenseProcessor.process_license(data)
     assert isinstance(license_obj, License)
     assert license_obj.short == "cc-by"
     assert license_obj.name == "CC BY 4.0"
     assert license_obj.uri == "https://creativecommons.org/licenses/by/4.0/"
 
+
 def test_license_processor_cc0():
     """Test processing a CC0 license"""
-    data = {
-        "primary_location": {
-            "license": "cc0"
-        }
-    }
+    data = {"primary_location": {"license": "cc0"}}
     license_obj = LicenseProcessor.process_license(data)
     assert isinstance(license_obj, License)
     assert license_obj.short == "cc0"
     assert license_obj.name == "CC0 1.0"
     assert license_obj.uri == "https://creativecommons.org/publicdomain/zero/1.0/"
 
+
 def test_license_processor_unknown_license():
     """Test processing an unknown license"""
-    data = {
-        "primary_location": {
-            "license": "unknown-license"
-        }
-    }
+    data = {"primary_location": {"license": "unknown-license"}}
     license_obj = LicenseProcessor.process_license(data)
     assert isinstance(license_obj, License)
     assert license_obj.short == "unknown-license"
@@ -41,17 +31,17 @@ def test_license_processor_unknown_license():
     assert license_obj.name == "unknown-license" or license_obj.name == ""
     assert hasattr(license_obj, "uri")
 
+
 def test_license_processor_no_license():
     """Test processing with no license information"""
-    data = {
-        "primary_location": {}
-    }
+    data = {"primary_location": {}}
     license_obj = LicenseProcessor.process_license(data)
     assert isinstance(license_obj, License)
     assert license_obj.short == "unknown"
     assert license_obj.name == ""
     assert license_obj.uri == ""
 
+
 def test_license_processor_no_primary_location():
     """Test processing with no primary location"""
     data = {}
diff --git a/tests/test_metadata_processor.py b/tests/test_metadata_processor.py
index ffacf4e..e489150 100644
--- a/tests/test_metadata_processor.py
+++ b/tests/test_metadata_processor.py
@@ -33,7 +33,10 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
     abstract_mock = MagicMock()
     abstract_mock.text = "This is a sample abstract"
     abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )
 
     # Mock the _fetch_data method to return our test data
     metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -47,21 +50,23 @@ def test_build_metadata_basic_fields(metadata_processor, openalex_data, monkeypa
 
     # Verify the basic metadata fields were extracted correctly
     assert metadata is not None
-    assert 'datasetVersion' in metadata
+    assert "datasetVersion" in metadata
 
     # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})
 
     # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]
 
     # Check for basic metadata fields in a more flexible way
-    field_names = [field.get('typeName') for field in fields]
-    assert 'title' in field_names
-    assert 'subject' in field_names
-    assert 'dsDescription' in field_names  # Description is named 'dsDescription' in the schema
+    field_names = [field.get("typeName") for field in fields]
+    assert "title" in field_names
+    assert "subject" in field_names
+    assert (
+        "dsDescription" in field_names
+    )  # Description is named 'dsDescription' in the schema
 
 
 def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
@@ -73,7 +78,10 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
     abstract_mock = MagicMock()
     abstract_mock.text = "This is a sample abstract"
     abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )
 
     # Mock the _fetch_data method to return our test data
     metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -86,33 +94,35 @@ def test_build_metadata_authors(metadata_processor, openalex_data, monkeypatch):
     metadata = metadata_processor._build_metadata(openalex_data)
 
     # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})
 
     # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]
 
     # Check for author and datasetContact fields
-    field_names = [field.get('typeName') for field in fields]
-    assert 'author' in field_names
-    assert 'datasetContact' in field_names
+    field_names = [field.get("typeName") for field in fields]
+    assert "author" in field_names
+    assert "datasetContact" in field_names
 
     # Verify these are compound fields with actual entries
     for field in fields:
-        if field.get('typeName') == 'author':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "author":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0
 
-        if field.get('typeName') == 'datasetContact':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
+        if field.get("typeName") == "datasetContact":
+            assert "value" in field
+            assert isinstance(field["value"], list)
             # The datasetContact might be empty in test environment
             # Just check it exists rather than asserting length
 
 
-def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, monkeypatch):
+def test_build_metadata_keywords_and_topics(
+    metadata_processor, openalex_data, monkeypatch
+):
     """Test that _build_metadata correctly extracts keywords and topics"""
     # Mock the console to avoid print errors
     metadata_processor.console = MagicMock()
@@ -121,7 +131,10 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
     abstract_mock = MagicMock()
     abstract_mock.text = "This is a sample abstract"
     abstract_mock.source = "openalex"
-    monkeypatch.setattr("doi2dataset.AbstractProcessor.get_abstract", lambda *args, **kwargs: abstract_mock)
+    monkeypatch.setattr(
+        "doi2dataset.AbstractProcessor.get_abstract",
+        lambda *args, **kwargs: abstract_mock,
+    )
 
     # Mock the _fetch_data method to return our test data
     metadata_processor._fetch_data = MagicMock(return_value=openalex_data)
@@ -134,27 +147,27 @@ def test_build_metadata_keywords_and_topics(metadata_processor, openalex_data, m
     metadata = metadata_processor._build_metadata(openalex_data)
 
     # Examine the fields inside datasetVersion.metadataBlocks
-    assert 'metadataBlocks' in metadata['datasetVersion']
-    citation = metadata['datasetVersion']['metadataBlocks'].get('citation', {})
+    assert "metadataBlocks" in metadata["datasetVersion"]
+    citation = metadata["datasetVersion"]["metadataBlocks"].get("citation", {})
 
     # Check fields in citation section
-    assert 'fields' in citation
-    fields = citation['fields']
+    assert "fields" in citation
+    fields = citation["fields"]
 
     # Check for keyword and subject fields
-    field_names = [field.get('typeName') for field in fields]
+    field_names = [field.get("typeName") for field in fields]
 
     # If keywords exist, verify structure
-    if 'keyword' in field_names:
+    if "keyword" in field_names:
         for field in fields:
-            if field.get('typeName') == 'keyword':
-                assert 'value' in field
-                assert isinstance(field['value'], list)
+            if field.get("typeName") == "keyword":
+                assert "value" in field
+                assert isinstance(field["value"], list)
 
     # Check for subject field which should definitely exist
-    assert 'subject' in field_names
+    assert "subject" in field_names
     for field in fields:
-        if field.get('typeName') == 'subject':
-            assert 'value' in field
-            assert isinstance(field['value'], list)
-            assert len(field['value']) > 0
+        if field.get("typeName") == "subject":
+            assert "value" in field
+            assert isinstance(field["value"], list)
+            assert len(field["value"]) > 0
diff --git a/tests/test_person.py b/tests/test_person.py
index 2e1e030..61e081d 100644
--- a/tests/test_person.py
+++ b/tests/test_person.py
@@ -8,7 +8,7 @@ def test_person_to_dict_with_string_affiliation():
         given_name="John",
         orcid="0000-0001-2345-6789",
         email="john.doe@example.org",
-        affiliation="Test University"
+        affiliation="Test University",
     )
 
     result = person.to_dict()
@@ -29,7 +29,7 @@ def test_person_to_dict_with_institution_ror():
         given_name="John",
         orcid="0000-0001-2345-6789",
         email="john.doe@example.org",
-        affiliation=inst
+        affiliation=inst,
     )
 
     result = person.to_dict()
@@ -48,7 +48,7 @@ def test_person_to_dict_with_institution_display_name_only():
         family_name="Smith",
         given_name="Jane",
         orcid="0000-0001-9876-5432",
-        affiliation=inst
+        affiliation=inst,
     )
 
     result = person.to_dict()
@@ -63,11 +63,7 @@ def test_person_to_dict_with_empty_institution():
     # Create an Institution with empty values
     inst = Institution("")
 
-    person = Person(
-        family_name="Brown",
-        given_name="Robert",
-        affiliation=inst
-    )
+    person = Person(family_name="Brown", given_name="Robert", affiliation=inst)
 
     result = person.to_dict()
 
@@ -79,9 +75,7 @@ def test_person_to_dict_with_empty_institution():
 def test_person_to_dict_with_no_affiliation():
     """Test Person.to_dict() with no affiliation."""
     person = Person(
-        family_name="Green",
-        given_name="Alice",
-        orcid="0000-0002-1111-2222"
+        family_name="Green", given_name="Alice", orcid="0000-0002-1111-2222"
     )
 
     result = person.to_dict()
diff --git a/tests/test_publication_utils.py b/tests/test_publication_utils.py
index d9dc978..40b506e 100644
--- a/tests/test_publication_utils.py
+++ b/tests/test_publication_utils.py
@@ -14,44 +14,44 @@ def metadata_processor():
     processor.console = MagicMock()
     return processor
 
+
 def test_get_publication_year_with_publication_year(metadata_processor):
     """Test that _get_publication_year extracts year from publication_year field"""
     data = {"publication_year": 2020}
     year = metadata_processor._get_publication_year(data)
     assert year == 2020
 
+
 def test_get_publication_year_with_date(metadata_processor):
     """Test that _get_publication_year returns empty string when publication_year is missing"""
     data = {"publication_date": "2019-05-15"}
     year = metadata_processor._get_publication_year(data)
     assert year == ""
 
+
 def test_get_publication_year_with_both_fields(metadata_processor):
     """Test that _get_publication_year prioritizes publication_year over date"""
-    data = {
-        "publication_year": 2020,
-        "publication_date": "2019-05-15"
-    }
+    data = {"publication_year": 2020, "publication_date": "2019-05-15"}
     year = metadata_processor._get_publication_year(data)
     assert year == 2020
 
+
 def test_get_publication_year_with_partial_date(metadata_processor):
     """Test that _get_publication_year returns empty string when only publication_date is present"""
     data = {"publication_date": "2018"}
     year = metadata_processor._get_publication_year(data)
     assert year == ""
 
+
 def test_get_publication_year_with_missing_data(metadata_processor):
     """Test that _get_publication_year handles missing data"""
     data = {"other_field": "value"}
     year = metadata_processor._get_publication_year(data)
     assert year == ""
 
+
 def test_get_publication_year_with_invalid_data(metadata_processor):
     """Test that _get_publication_year returns whatever is in publication_year field"""
-    data = {
-        "publication_year": "not-a-year",
-        "publication_date": "invalid-date"
-    }
+    data = {"publication_year": "not-a-year", "publication_date": "invalid-date"}
     year = metadata_processor._get_publication_year(data)
     assert year == "not-a-year"
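+
+
+# NOTE: illustrative addition by the editor, not part of the original patch.
+# A minimal sketch of one more edge case, assuming _get_publication_year keeps
+# its current contract of returning data.get("publication_year", "") verbatim:
+# an explicit None value is passed through unchanged rather than normalized
+# to the empty-string default used for a missing key.
+def test_get_publication_year_with_none_value(metadata_processor):
+    """Test that _get_publication_year passes an explicit None through unchanged"""
+    data = {"publication_year": None}
+    year = metadata_processor._get_publication_year(data)
+    assert year is None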