From 9be53fd2fc1cbe1345c27a7db4908f9d8ec170a1 Mon Sep 17 00:00:00 2001
From: Alexander Minges
Date: Fri, 21 Mar 2025 14:53:23 +0100
Subject: [PATCH] Initial commit and release of doi2dataset
---
.gitignore | 186 ++++
LICENSE.md | 27 +
README.md | 79 ++
__init__.py | 0
config_example.yaml | 23 +
docs/Makefile | 20 +
docs/make.bat | 35 +
docs/source/conf.py | 31 +
docs/source/doi2dataset.rst | 7 +
docs/source/faq.rst | 14 +
docs/source/index.rst | 34 +
docs/source/installation.rst | 28 +
docs/source/introduction.rst | 8 +
docs/source/modules.rst | 9 +
docs/source/setup.rst | 7 +
docs/source/usage.rst | 77 ++
doi2dataset.py | 1733 ++++++++++++++++++++++++++++++++++
requirements-dev.txt | 3 +
requirements-doc.txt | 2 +
requirements.txt | 6 +
setup.py | 46 +
tests/test_doi2dataset.py | 50 +
tests/test_fetch_doi_mock.py | 57 ++
23 files changed, 2482 insertions(+)
create mode 100644 .gitignore
create mode 100644 LICENSE.md
create mode 100644 README.md
create mode 100644 __init__.py
create mode 100644 config_example.yaml
create mode 100644 docs/Makefile
create mode 100644 docs/make.bat
create mode 100644 docs/source/conf.py
create mode 100644 docs/source/doi2dataset.rst
create mode 100644 docs/source/faq.rst
create mode 100644 docs/source/index.rst
create mode 100644 docs/source/installation.rst
create mode 100644 docs/source/introduction.rst
create mode 100644 docs/source/modules.rst
create mode 100644 docs/source/setup.rst
create mode 100644 docs/source/usage.rst
create mode 100755 doi2dataset.py
create mode 100644 requirements-dev.txt
create mode 100644 requirements-doc.txt
create mode 100644 requirements.txt
create mode 100644 setup.py
create mode 100644 tests/test_doi2dataset.py
create mode 100644 tests/test_fetch_doi_mock.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b861df0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,186 @@
+# Config file
+config.yaml
+
+# Processed DOIs
+*.json
+
+# Typing stubs
+typing/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Typing stubs
+typings/
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..becccee
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,27 @@
+MIT License
+
+Copyright (c) 2025 Alexander Minges
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fb83831
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+# doi2dataset
+
+**doi2dataset** is a Python tool designed to process DOIs and generate metadata for Dataverse datasets. It retrieves metadata from external APIs (such as OpenAlex and CrossRef), maps metadata fields, and can optionally upload the generated metadata to a Dataverse instance.
+
+## Features
+
+- **DOI Validation and Normalization:** Validates DOIs and converts them into a standardized format.
+- **Metadata Retrieval:** Fetches metadata such as title, abstract, license, and author information from external sources.
+- **Metadata Mapping:** Automatically maps and generates metadata fields (e.g., title, description, keywords) including support for controlled vocabularies and compound fields.
+- **Optional Upload:** Allows uploading of metadata directly to a Dataverse server.
+- **Progress Tracking:** Uses the Rich library for user-friendly progress tracking and error handling.
+
+## Installation
+
+Clone the repository:
+
+```bash
+git clone https://git.athemis.de/Athemis/doi2dataset
+cd doi2dataset
+```
+
+## Configuration
+
+Before running the tool, configure the necessary settings in the `config.yaml` file located in the project root. This file contains configuration details such as:
+
+- Dataverse connection details (URL, API token, authentication credentials)
+- Mapping of project phases
+- Principal Investigator (PI) information
+- Default grant configurations
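+
+A minimal example (the values below are placeholders; see `config_example.yaml` and the usage documentation for the full structure):
+
+```yaml
+dataverse:
+  url: "https://your.dataverse.server"
+  api_token: "your_api_token"
+  auth_user: "your_username"
+  auth_password: "your_password"
+  dataverse: "your_dataverse_name"
+
+default_grants:
+  - funder: "Funder Name"
+    id: "GrantID12345"
+```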
+
+## Usage
+
+Run doi2dataset from the command line by providing one or more DOIs:
+
+```bash
+python doi2dataset.py [options] DOI1 DOI2 ...
+```
+
+### Command Line Options
+
+- `-f, --file`
+ Specify a file containing DOIs (one per line).
+
+- `-o, --output-dir`
+ Directory where metadata files will be saved.
+
+- `-d, --depositor`
+ Name of the depositor.
+
+- `-s, --subject`
+ Default subject for the metadata.
+
+- `-m, --contact-mail`
+ Contact email address.
+
+- `-u, --upload`
+ Upload metadata to a Dataverse server.
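+
+For example, to process DOIs listed in a file, write the metadata to `output/`, and upload it to the configured Dataverse instance:
+
+```bash
+python doi2dataset.py -f dois.txt -o output/ -d "John Doe" -m "john.doe@example.com" -u
+```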
+
+## Documentation
+
+Documentation is generated using Sphinx. See the `docs/` directory for detailed API references and usage examples.
+
+## Testing
+
+Tests are implemented with pytest. To run the tests, execute:
+
+```bash
+pytest
+```
+
+## Contributing
+
+Contributions are welcome! Please fork the repository and submit a pull request with your improvements.
+
+## License
+
+This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/config_example.yaml b/config_example.yaml
new file mode 100644
index 0000000..f14cdad
--- /dev/null
+++ b/config_example.yaml
@@ -0,0 +1,23 @@
+default_grants:
+ - funder: "Awesome Funding Agency"
+ id: "ABC12345"
+
+phase:
+ "Phase 1 (2021/2025)":
+ start: 2021
+ end: 2025
+
+pis:
+ - family_name: "Doe"
+ given_name: "Jon"
+ orcid: "0000-0000-0000-0000"
+ email: "jon.doe@some-university.edu"
+ affiliation: "Institute of Science, Some University"
+ project: ["Project A01"]
+
+ - family_name: "Doe"
+ given_name: "Jane"
+ orcid: "0000-0000-0000-0001"
+ email: "jane.doe@some-university.edu"
+ affiliation: "Institute of Science, Some University"
+ project: ["Project A02"]
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d0c3cbf
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..747ffb7
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.https://www.sphinx-doc.org/
+ exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..921497b
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,31 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath('../..'))
+
+project = 'doi2dataset'
+copyright = '2025, Alexander Minges'
+author = 'Alexander Minges'
+release = '1.0'
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = ['_static']
diff --git a/docs/source/doi2dataset.rst b/docs/source/doi2dataset.rst
new file mode 100644
index 0000000..b262195
--- /dev/null
+++ b/docs/source/doi2dataset.rst
@@ -0,0 +1,7 @@
+doi2dataset module
+==================
+
+.. automodule:: doi2dataset
+ :members:
+ :show-inheritance:
+ :undoc-members:
diff --git a/docs/source/faq.rst b/docs/source/faq.rst
new file mode 100644
index 0000000..026e0c5
--- /dev/null
+++ b/docs/source/faq.rst
@@ -0,0 +1,14 @@
+Frequently Asked Questions (FAQ)
+================================
+
+Q: What is **doi2dataset**?
+A: **doi2dataset** is a tool to process DOIs and generate metadata for Dataverse datasets by fetching data from external APIs like OpenAlex and CrossRef.
+
+Q: How do I install **doi2dataset**?
+A: You can clone the repository from GitHub or install it via pip. Please refer to the Installation section for details.
+
+Q: Can I upload metadata directly to a Dataverse server?
+A: Yes, the tool provides an option to upload metadata via the command line using the ``-u`` flag. Ensure that your configuration in ``config.yaml`` is correct.
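+
+For example (the DOI below is just a placeholder):
+
+.. code-block:: bash
+
+   python doi2dataset.py -u 10.1234/example.doi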
+
+Q: Where can I find the API documentation?
+A: The API reference is generated automatically in the Modules section of this documentation.
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..e5c1e6e
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,34 @@
+.. doi2dataset documentation master file, created by
+ sphinx-quickstart on Fri Mar 21 13:03:59 2025.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+doi2dataset documentation
+=========================
+
+Overview
+--------
+**doi2dataset** is a Python tool designed to process DOIs and generate metadata for Dataverse datasets.
+It retrieves data from external APIs such as OpenAlex and CrossRef and converts it into a format that meets Dataverse requirements.
+
+Key Features:
+
+- **Validation** and normalization of DOIs
+- Retrieval and processing of **metadata** (e.g., abstract, license, author information)
+- Automatic mapping and generation of metadata fields (e.g., title, description, keywords)
+- Support for controlled vocabularies and complex (compound) metadata fields
+- Optional **uploading** of metadata to a Dataverse server
+- **Progress tracking** and error handling using the Rich library
+
+
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+ :titlesonly:
+
+ introduction
+ installation
+ usage
+ modules
+ faq
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 0000000..72a16f5
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,28 @@
+Installation
+============
+
+There are several ways to install **doi2dataset**:
+
+Using Git
+---------
+Clone the repository from GitHub by running the following commands in your terminal:
+
+.. code-block:: bash
+
+ git clone https://github.com/your_username/doi2dataset.git
+ cd doi2dataset
+
+Using pip (if available)
+-------------------------
+You can also install **doi2dataset** via pip:
+
+.. code-block:: bash
+
+ pip install doi2dataset
+
+Configuration
+-------------
+After installation, ensure that the tool is configured correctly.
+Check the ``config.yaml`` file in the project root for necessary settings such as Dataverse connection details and PI information.
+
+For more detailed instructions, please refer to the README file provided with the project.
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
new file mode 100644
index 0000000..3d4f703
--- /dev/null
+++ b/docs/source/introduction.rst
@@ -0,0 +1,8 @@
+Introduction
+============
+
+Welcome to the **doi2dataset** documentation. This guide provides an in-depth look at the tool, its purpose, and how it can help you generate metadata for Dataverse datasets.
+
+The **doi2dataset** tool is aimed at researchers, data stewards, and developers who need to convert DOI-based metadata into a format compatible with Dataverse. It automates the retrieval of metadata from external sources (like OpenAlex and CrossRef) and performs necessary data transformations.
+
+In the following sections, you'll learn about the installation process, usage examples, and a detailed API reference.
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
new file mode 100644
index 0000000..e84e41a
--- /dev/null
+++ b/docs/source/modules.rst
@@ -0,0 +1,9 @@
+API Reference
+=============
+
+This section contains the API reference generated from the source code docstrings.
+
+.. automodule:: doi2dataset
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/source/setup.rst b/docs/source/setup.rst
new file mode 100644
index 0000000..1084cc6
--- /dev/null
+++ b/docs/source/setup.rst
@@ -0,0 +1,7 @@
+setup module
+============
+
+.. automodule:: setup
+ :members:
+ :show-inheritance:
+ :undoc-members:
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..4e270c3
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,77 @@
+Usage
+=====
+
+**doi2dataset** is run from the command line. Below is an example of how to use the tool.
+
+Basic Example
+-------------
+To process one or more DOIs, run:
+
+.. code-block:: bash
+
+ python doi2dataset.py 10.1234/doi1 10.5678/doi2
+
+Command Line Options
+--------------------
+The tool offers several command line options:
+
+- ``-f, --file``: Specify a file containing DOIs (one per line).
+- ``-o, --output-dir``: Directory where metadata files will be saved.
+- ``-d, --depositor``: Name of the depositor.
+- ``-s, --subject``: Default subject for the metadata.
+- ``-m, --contact-mail``: Contact email address.
+- ``-u, --upload``: Flag to upload metadata to a Dataverse server.
+
+Configuration via config.yaml
+-------------------------------
+Some options are also set via the **config.yaml** file. This file includes settings such as:
+
+- Dataverse connection details (URL, API token, authentication credentials).
+- Mapping of project phases.
+- PI (principal investigator) information.
+- Default grant configurations.
+
+Make sure that your **config.yaml** is properly configured before running the tool. For example, your **config.yaml** might include:
+
+.. code-block:: yaml
+
+ dataverse:
+ url: "https://your.dataverse.server"
+ api_token: "your_api_token"
+ auth_user: "your_username"
+ auth_password: "your_password"
+ dataverse: "your_dataverse_name"
+
+ phase:
+ Phase1:
+ start: 2010
+ end: 2015
+ Phase2:
+ start: 2016
+ end: 2020
+
+ pis:
+ - given_name: "John"
+ family_name: "Doe"
+ email: "john.doe@example.com"
+ orcid: "0000-0001-2345-6789"
+ affiliation: "Example University"
+ project:
+ - "Project A"
+ - "Project B"
+
+ default_grants:
+ - funder: "Funder Name"
+ id: "GrantID12345"
+
+Usage Example with Configuration
+----------------------------------
+If you have configured your **config.yaml** and want to process DOIs from a file while uploading the metadata, you could run:
+
+.. code-block:: bash
+
+ python doi2dataset.py -f dois.txt -o output/ -d "John Doe" -s "Medicine, Health and Life Sciences" -m "john.doe@example.com" -u
+
+This command will use the options provided on the command line as well as the settings from **config.yaml**.
+
+For more details on usage and configuration, please refer to the rest of the documentation.
diff --git a/doi2dataset.py b/doi2dataset.py
new file mode 100755
index 0000000..02fa6bd
--- /dev/null
+++ b/doi2dataset.py
@@ -0,0 +1,1733 @@
+#!/usr/bin/env python
+"""
+doi2dataset.py
+
+This script processes DOIs to generate metadata for Dataverse datasets.
+It supports fetching data from OpenAlex and CrossRef, mapping metadata fields,
+processing author and grant information, and optionally uploading the metadata
+to a Dataverse instance.
+
+Usage:
+ python doi2dataset.py [options] doi1 doi2 ...
+
+Options:
+ -f, --file File containing DOIs (one per line)
+ -o, --output-dir Output directory for metadata files (default: current directory)
+ -d, --depositor Name of the depositor
+ -s, --subject Default subject (default: "Medicine, Health and Life Sciences")
+ -m, --contact-mail Contact email address
+ -u, --upload Upload metadata to Dataverse
+"""
+
+import argparse
+import json
+import sys
+import unicodedata
+import warnings # TODO: Remove once the warning is stripped from idutils
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from functools import reduce
+from pathlib import Path
+from typing import Any, Sequence
+
+import dns.resolver
+import requests
+import yaml
+from email_validator import EmailNotValidError, validate_email
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import (
+ BarColumn,
+ Progress,
+ SpinnerColumn,
+ TaskID,
+ TextColumn,
+ TimeElapsedColumn,
+)
+from rich.table import Table
+from rich.theme import Theme
+
+# Idutils throws an unconditional warning about deprecation of relative imports.
+# Since we are not using them, suppress the warning so as not to confuse users.
+# TODO: Remove once the warning is stripped from idutils
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+from idutils.normalizers import normalize_doi, normalize_orcid, normalize_pmid
+from idutils.validators import is_doi
+
+# Script version
+VERSION = "1.0"
+
+# Icon definitions for console output
+ICONS = {
+ 'success': "✓", # Simple checkmark
+ 'error': "✗", # Simple X
+ 'warning': "!", # Simple exclamation
+ 'info': "ℹ", # Info symbol
+ 'processing': "⋯", # Three dots
+ 'done': "∎", # Filled square
+ 'file': "⨳", # Document symbol
+ 'folder': "⊞", # Folder symbol
+ 'clock': "◷", # Clock symbol
+ 'search': "⌕", # Search symbol
+ 'data': "≡", # Three lines
+ 'doi': "∾", # Link symbol
+ 'total': "∑", # Sum symbol
+ 'save': "⤓", # Save/download arrow
+ 'upload': "⤒" # Upload arrow
+}
+
+# Theme configuration for Rich console output
+THEME = Theme({
+ "info": "cyan",
+ "warning": "yellow",
+ "error": "red bold",
+ "success": "green",
+})
+
+# Available sources for metadata abstracts
+SOURCES = ["openalex", "crossref", "none"]
+
+def format_status(icon: str, message: str, style: str = "default") -> str:
+ """
+ Format a status message with an icon and a given style.
+
+ Args:
+ icon (str): Key for the icon character from the ICONS dictionary.
+ message (str): The status message.
+ style (str): The style to apply (e.g., 'default', 'info', 'warning', 'error', 'success').
+
+ Returns:
+ str: The formatted status message.
+ """
+ return f"[{style}]{ICONS[icon]} {message}[/{style}]"
+
+class FieldType(Enum):
+ """Enum representing different Dataverse field types."""
+ PRIMITIVE = "primitive"
+ COMPOUND = "compound"
+ VOCABULARY = "controlledVocabulary"
+
+@dataclass
+class Phase:
+ """
+ Represents a project phase with a defined time span.
+
+ Attributes:
+ name (str): The name of the project phase.
+ start (int): The start year of the project phase.
+ end (int): The end year of the project phase.
+ """
+
+ name: str
+ start: int
+ end: int
+
+ def check_year(self, year: int) -> bool:
+ """
+ Checks whether a given year falls within the project's phase boundaries.
+
+ Args:
+ year (int): The year to check.
+
+ Returns:
+ bool: True if the year is within the phase boundaries, otherwise False.
+ """
+
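+        # e.g. Phase("Phase 1 (2021/2025)", 2021, 2025).check_year(2023) -> True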
+ if self.start <= year <= self.end:
+ return True
+ return False
+
+@dataclass
+class BaseMetadataField[T]:
+ """
+ Base class for Dataverse metadata fields.
+
+ This class defines a metadata field with a name, a value of type T, and
+ a flag indicating whether multiple values are allowed. It serves as
+ a template for specific metadata field implementations.
+
+ Attributes:
+ name (str): The name of the metadata field.
+ multiple (bool): Indicates whether multiple values are allowed.
+ value (T): The value stored in the field.
+ type (FieldType): The type of the field, automatically set based on T.
+ """
+ name: str
+ multiple: bool
+ value: T
+ type: FieldType = field(init=False)
+
+ def __post_init__(self) -> None:
+ """
+ After initialization, determine the field type by calling the _set_type method.
+ """
+ self._set_type()
+
+ def _set_type(self) -> None:
+ """
+ Set the `type` attribute based on the field's value.
+
+ This method must be implemented by subclasses.
+
+ Raises:
+ NotImplementedError: If not implemented by a subclass.
+ """
+ raise NotImplementedError("Subclasses must implement the _set_type method.")
+
+ def to_dict(self) -> dict[str, Any]:
+ """
+ Convert the metadata field to a dictionary representation.
+
+ Returns:
+ dict[str, Any]: Dictionary representation of the metadata field.
+
+ Raises:
+ NotImplementedError: If not implemented by a subclass.
+ """
+ raise NotImplementedError("Subclasses must implement the to_dict method.")
+
+@dataclass
+class PrimitiveMetadataField(BaseMetadataField[str]):
+ """
+ Metadata field representing a primitive type (e.g., string) for Dataverse.
+ """
+ def _set_type(self) -> None:
+ self.type = FieldType.PRIMITIVE
+
+ def to_dict(self) -> dict[str, str | bool]:
+ """
+ Convert the primitive metadata field to a dictionary representation.
+
+ Returns:
+ dict[str, str | bool]: Dictionary with field properties.
+ """
+ return {
+ "typeName": self.name,
+ "typeClass": self.type.value,
+ "multiple": self.multiple,
+ "value": self.value,
+ }
+
+@dataclass
+class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]):
+ """
+ Metadata field for controlled vocabulary values.
+ """
+ def _set_type(self) -> None:
+ self.type = FieldType.VOCABULARY
+
+ def to_dict(self) -> dict[str, Any]:
+ """
+ Convert the controlled vocabulary metadata field to a dictionary.
+
+ Returns:
+ dict[str, Any]: Dictionary representation.
+ """
+ return {
+ "typeName": self.name,
+ "typeClass": self.type.value,
+ "multiple": self.multiple,
+ "value": self.value,
+ }
+
+
+@dataclass
+class CompoundMetadataField(
+ BaseMetadataField[Sequence[Sequence[PrimitiveMetadataField | ControlledVocabularyMetadataField]]]
+):
+ """
+ Metadata field representing compound types, composed of multiple subfields.
+ """
+ def _set_type(self) -> None:
+ self.type = FieldType.COMPOUND
+
+ def to_dict(self) -> dict[str, Any]:
+ """
+ Convert the compound metadata field to a dictionary representation.
+
+ Returns:
+ dict[str, Any]: Dictionary representation of the compound field.
+ """
+ value_list: list[dict[str, Any]] = []
+ for outer_list in self.value:
+ field_dicts: list[dict[str, Any]] = []
+ for field_item in outer_list:
+ field_dicts.append({field_item.name: field_item.to_dict()})
+ value_list.append(reduce(lambda d1, d2: d1 | d2, field_dicts))
+
+ return {
+ "typeName": self.name,
+ "typeClass": self.type.value,
+ "multiple": self.multiple,
+ "value": value_list
+ }
+
+@dataclass
+class Person:
+ """
+ Represents a person (e.g., an author or a PI).
+
+ Attributes:
+ family_name (str): Family name of the person.
+ given_name (str): Given name of the person.
+ orcid (str): ORCID identifier (optional).
+ email (str): Email address (optional).
+ affiliation (str): Affiliation of the person (optional).
+ project (list[str]): List of associated projects.
+ """
+ family_name: str
+ given_name: str
+ orcid: str = ""
+ email: str = ""
+ affiliation: str = ""
+ project: list[str] = field(default_factory=list)
+
+ def format_name(self) -> str:
+ """
+ Format the name in 'Family, Given' order.
+
+ Returns:
+ str: Formatted name.
+ """
+ return f"{self.family_name}, {self.given_name}"
+
+ def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]:
+ """
+ Build metadata fields for an author.
+
+ Returns:
+ list: List of metadata fields representing the author.
+ """
+ if self.orcid:
+ return [
+ PrimitiveMetadataField("authorName", False, self.format_name()),
+ PrimitiveMetadataField("authorAffiliation", False, self.affiliation),
+ ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
+ PrimitiveMetadataField("authorIdentifier", False, self.orcid)
+ ]
+ else:
+ return [
+ PrimitiveMetadataField("authorName", False, self.format_name()),
+ PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
+ ]
+
+ def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
+ """
+ Build metadata fields for dataset contact information.
+
+ Returns:
+ list: List of metadata fields for the dataset contact.
+ """
+ return [
+ PrimitiveMetadataField("datasetContactName", False, self.format_name()),
+ PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation),
+ PrimitiveMetadataField("datasetContactEmail", False, self.email)
+ ]
+
+@dataclass
+class License:
+ """
+ Represents a license with name, URI, and short identifier.
+
+ Attributes:
+ name (str): The full name of the license.
+ uri (str): The license URI.
+ short (str): The short identifier of the license.
+ """
+ name: str
+ uri: str
+ short: str
+
+@dataclass
+class Abstract:
+ """
+ Represents an abstract with its text and source.
+
+ Attributes:
+ text (str): The abstract text.
+ source (str): The source of the abstract ('crossref', 'openalex', or 'none').
+ """
+ text: str
+ source: str
+
+ def __post_init__(self):
+ allowed_sources = ["crossref", "openalex", "none"]
+ if self.source not in allowed_sources:
+ raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.")
+
+@dataclass
+class ConfigData:
+ """
+ Represents configuration data loaded from a YAML file.
+
+ Attributes:
+ dataverse (dict[str, str]): Dataverse-related configuration.
+ phase (dict[str, dict[str, int]]): Mapping of project phases.
+ pis (list[dict[str, Any]]): List of principal investigator configurations.
+ default_grants (list[dict[str, str]]): Default grant configurations.
+ """
+ dataverse: dict[str, str]
+ phase: dict[str, dict[str, int]]
+ pis: list[dict[str, Any]]
+ default_grants: list[dict[str, str]]
+
+class Config:
+ """
+ Singleton class to handle configuration loading and retrieval.
+ """
+ _instance: 'Config | None' = None
+ _config_data: ConfigData | None = None
+
+ def __new__(cls) -> 'Config':
+ """
+ Create and return the singleton instance of Config.
+
+ Returns:
+ Config: The singleton instance.
+ """
+ if cls._instance is None:
+ cls._instance = super(Config, cls).__new__(cls)
+ return cls._instance
+
+ @classmethod
+ def load_config(cls, config_path: str | Path | None = None) -> None:
+ """
+ Load configuration from a YAML file.
+
+ Args:
+ config_path (str | Path | None): Path to the configuration file.
+ If None, the default config.yaml in the script directory is used.
+
+ Raises:
+ FileNotFoundError: If the configuration file does not exist.
+ ValueError: If any PI email address is invalid.
+ """
+ if config_path is None:
+ config_path = Path(__file__).parent / "config.yaml"
+
+ config_path = Path(config_path)
+ if not config_path.exists():
+ raise FileNotFoundError(f"Config file not found: {config_path}")
+
+ with open(config_path, 'r', encoding='utf-8') as f:
+ config_data = yaml.safe_load(f)
+
+ # Validate PI email addresses
+ pis = config_data.get('pis', [])
+ for pi in pis:
+ if email := pi.get('email'):
+ if not validate_email_address(email):
+ raise ValueError(f"Configuration Error: Invalid email address for PI {pi.get('given_name', '')} {pi.get('family_name', '')}: {email}")
+
+ cls._config_data = ConfigData(
+ dataverse=config_data.get('dataverse', {}),
+ phase=config_data.get('phase', {}),
+ pis=config_data.get('pis', []),
+ default_grants=config_data.get('default_grants', [])
+ )
+
+ @classmethod
+ def get_config(cls) -> ConfigData:
+ """
+ Retrieve the loaded configuration data.
+
+ Returns:
+ ConfigData: The configuration data.
+
+ Raises:
+ RuntimeError: If the configuration could not be loaded.
+ """
+ if cls._config_data is None:
+ cls.load_config()
+ if cls._config_data is None:
+ raise RuntimeError("Failed to load configuration")
+ return cls._config_data
+
+ @property
+ def PHASE(self) -> dict[str, dict[str, int]]:
+ """
+ Get phase configuration.
+
+ Returns:
+ dict[str, dict[str, int]]: Mapping of phases.
+ """
+ return self.get_config().phase
+
+ @property
+ def PIS(self) -> list[dict[str, Any]]:
+ """
+ Get PI configurations.
+
+ Returns:
+ list[dict[str, Any]]: List of PI configurations.
+ """
+ return self.get_config().pis
+
+ @property
+ def DEFAULT_GRANTS(self) -> list[dict[str, str]]:
+ """
+ Get default grant configurations.
+
+ Returns:
+ list[dict[str, str]]: List of default grants.
+ """
+ return self.get_config().default_grants
+
+ @property
+ def DATAVERSE(self) -> dict[str, str]:
+ """
+ Get Dataverse configurations.
+
+ Returns:
+ dict[str, str]: Dataverse configuration.
+ """
+ return self.get_config().dataverse
+
+class APIClient:
+ """
+ Client for making HTTP requests to external APIs.
+
+ Attributes:
+ session (requests.Session): The underlying requests session.
+ """
+ def __init__(self, contact_mail: str | None = None, user_agent: str = f"UDE-Doi2Dataset/{VERSION}", token: str | None = None) -> None:
+ """
+ Initialize the API client with optional contact mail, user agent, and token.
+
+ Args:
+ contact_mail (str | None): Contact email address.
+ user_agent (str): User agent string.
+ token (str | None): Optional API token.
+ """
+ self.session = requests.Session()
+ self._set_headers(contact_mail, user_agent, token)
+
+ def _set_headers(self, contact_mail: str | None, user_agent: str, token: str | None) -> None:
+ """
+ Set HTTP headers for the session based on contact email and token.
+
+ Args:
+ contact_mail (str | None): Contact email address.
+ user_agent (str): User agent string.
+ token (str | None): Optional API token.
+ """
+ if contact_mail:
+ header = {"User-Agent": f"{user_agent} (mailto:{contact_mail})"}
+ else:
+ header = {"User-Agent": user_agent}
+
+ if token:
+ header["X-Dataverse-key"] = token
+
+ self.session.headers.update(header)
+
+ def make_request(self, url: str, method: str = "GET", **kwargs: Any) -> requests.Response | None:
+ """
+ Make an HTTP request and return the response.
+
+ Args:
+ url (str): The URL to request.
+ method (str): HTTP method to use (default: GET).
+ **kwargs: Additional arguments for requests.request.
+
+ Returns:
+ requests.Response | None: The HTTP response, or None if the request failed.
+ """
+ try:
+ response = self.session.request(method, url, **kwargs)
+ response.raise_for_status()
+ return response
+ except requests.exceptions.RequestException as e:
+ print(f"\n{ICONS['error']} Request failed: {str(e)}")
+ return None
+
+class NameProcessor:
+ """
+ Provides utility methods for processing names.
+ """
+ @staticmethod
+ def normalize_string(s: str) -> str:
+ """
+ Normalize a string using Unicode NFKD normalization and convert to ASCII.
+
+ Args:
+ s (str): The string to normalize.
+
+ Returns:
+ str: The normalized string.
+ """
+ return unicodedata.normalize("NFKD", s.lower()).encode("ASCII", "ignore").decode("ASCII")
+
+ @staticmethod
+ def split_name(full_name: str) -> tuple[str, str]:
+ """
+ Split a full name into given and family names.
+
+ Args:
+ full_name (str): The full name (e.g., "Doe, John" or "John Doe").
+
+ Returns:
+ tuple[str, str]: A tuple (given_name, family_name).
+ """
+ if "," in full_name:
+ surname, given_name = full_name.split(",", 1)
+ return given_name.strip(), surname.strip()
+
+ parts = full_name.strip().split()
+ if len(parts) == 1:
+ return "", parts[0]
+
+ return " ".join(parts[:-1]), parts[-1]
+
+class PIFinder:
+ """
+ Finds principal investigators (PIs) among a list of Person objects.
+ """
+ def __init__(self, pis: list[Person]) -> None:
+ """
+ Initialize with a list of Person objects representing potential PIs.
+
+ Args:
+ pis (list[Person]): List of Person objects.
+ """
+ self.pis = pis
+
+ def find_pi(self, family_name: str | None = None, orcid: str | None = None, given_name: str | None = None) -> Person | None:
+ """
+ Find a PI by ORCID or name.
+
+ Args:
+ family_name (str | None): Family name.
+ orcid (str | None): ORCID identifier.
+ given_name (str | None): Given name.
+
+ Returns:
+ Person | None: The matched PI or None if not found.
+ """
+ if orcid:
+ return self._find_by_orcid(normalize_orcid(orcid))
+
+ if family_name:
+ return self._find_by_name(family_name, given_name)
+
+ return None
+
+ def _find_by_orcid(self, orcid: str) -> Person | None:
+ """
+ Find a PI by ORCID.
+
+ Args:
+ orcid (str): Normalized ORCID.
+
+ Returns:
+ Person | None: The matched PI or None.
+ """
+ for person in self.pis:
+ if normalize_orcid(person.orcid) == orcid:
+ return person
+ return None
+
+ def _find_by_name(self, family_name: str, given_name: str | None) -> Person | None:
+ """
+ Find a PI by family name (and optionally given name).
+
+ Args:
+ family_name (str): Family name.
+ given_name (str | None): Given name (optional).
+
+ Returns:
+ Person | None: The matched PI or None.
+ """
+ matches: list[Person] = []
+ normalized_family_name = NameProcessor.normalize_string(family_name)
+
+ for person in self.pis:
+ if NameProcessor.normalize_string(person.family_name) == normalized_family_name:
+ matches.append(person)
+
+ if not matches:
+ return None
+
+ if given_name:
+ normalized_given_name = NameProcessor.normalize_string(given_name)
+ for match in matches:
+ if NameProcessor.normalize_string(match.given_name) == normalized_given_name:
+ return match
+ return None
+
+ if len(matches) == 1:
+ return matches[0]
+
+ raise ValueError("Multiple matches found for family name")
+
+class LicenseProcessor:
+ """
+ Processes license information from metadata.
+ """
+ LICENSE_MAP = {
+ "cc-by": ("https://creativecommons.org/licenses/by/4.0/", "CC BY 4.0"),
+ "cc-by-sa": ("https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"),
+ "cc-by-nc": ("https://creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"),
+ "cc-by-nc-sa": ("https://creativecommons.org/licenses/by-nc-sa/4.0/", "CC BY-NC-SA 4.0"),
+ "cc-by-nc-nd": ("https://creativecommons.org/licenses/by-nc-nd/4.0/", "CC BY-NC-ND 4.0"),
+ "cc-by-nd": ("https://creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"),
+ "cc0": ("https://creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
+ "pd": ("https://creativecommons.org/publicdomain/mark/1.0/", "Public Domain Mark 1.0"),
+ }
+
+ @classmethod
+ def process_license(cls, data: dict[str, Any]) -> License:
+ """
+ Process and return license information based on input data.
+
+ Args:
+ data (dict[str, Any]): Input data containing license info.
+
+ Returns:
+ License: Processed license information.
+ """
+ location = data.get("primary_location", {})
+ license_short = location.get("license", "")
+
+ if not license_short:
+ return License(name="", uri="", short="unknown")
+
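+        # Strip any version suffix before the lookup, e.g. "cc-by-nc-sa/4.0" -> "cc-by-nc-sa"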
+ base_license = license_short.split("/")[0].lower()
+ uri, name = cls.LICENSE_MAP.get(base_license, ("", license_short))
+ return License(name=name, uri=uri, short=license_short)
+
+class AbstractProcessor:
+ """
+ Retrieves and processes abstracts from CrossRef and OpenAlex.
+ """
+ def __init__(self, api_client: APIClient):
+ """
+ Initialize with an APIClient instance.
+
+ Args:
+ api_client (APIClient): The API client to use for requests.
+ """
+ self.api_client = api_client
+
+ def get_abstract(self, doi: str, data: dict[str, Any], license: License) -> Abstract:
+ """
+ Get an abstract based on DOI and license permissions.
+
+ Args:
+ doi (str): The DOI.
+ data (dict[str, Any]): Data retrieved from an external source.
+ license (License): License information.
+
+ Returns:
+ Abstract: The abstract with its source.
+ """
+ license_ok = {"cc-by", "cc-by-sa", "cc-by-nc", "cc-by-nc-sa", "cc0", "pd"}
+
+ if license.short in license_ok:
+ console.print(f"\n{ICONS['info']} License {license.name} allows derivative works. Pulling abstract from CrossRef.", style="info")
+ crossref_abstract = self._get_crossref_abstract(doi)
+ if crossref_abstract:
+ return Abstract(text=crossref_abstract, source="crossref")
+ else:
+ console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning")
+ else:
+ console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
+
+
+ openalex_abstract = self._get_openalex_abstract(data)
+ if openalex_abstract:
+ return Abstract(text=openalex_abstract, source="openalex")
+ else:
+ console.print(f"\n{ICONS['warning']} No abstract found in OpenAlex!", style="warning")
+
+ console.print(f"\n{ICONS['warning']} No abstract found in either CrossRef nor OpenAlex!", style="warning")
+ return Abstract(text="", source="none")
+
+ def _get_crossref_abstract(self, doi: str) -> str | None:
+ """
+ Retrieve abstract from CrossRef API.
+
+ Args:
+ doi (str): The DOI.
+
+ Returns:
+ str | None: The abstract if found, otherwise None.
+ """
+ url = f"https://api.crossref.org/works/{doi}"
+ response = self.api_client.make_request(url)
+
+ if response and response.status_code == 200:
+ abstract_raw = response.json().get("message", {}).get("abstract")
+ return self._clean_jats(abstract_raw)
+ return None
+
+ def _get_openalex_abstract(self, data: dict[str, Any]) -> str | None:
+ """
+ Retrieve abstract from OpenAlex data.
+
+ Args:
+ data (dict[str, Any]): Data from OpenAlex.
+
+ Returns:
+ str | None: The reconstructed abstract, or None if not available.
+ """
+ inv_index = data.get("abstract_inverted_index")
+ if not inv_index:
+ return None
+
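+        # Rebuild the abstract from the inverted index,
+        # e.g. {"Hello": [0], "again": [1, 3], "world": [2]} -> "Hello again world again"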
+ word_positions = [(word, pos) for word, positions in inv_index.items() for pos in positions]
+ sorted_words = sorted(word_positions, key=lambda x: x[1])
+ return " ".join(word for word, _ in sorted_words)
+
+ def _clean_jats(self, text: str | None) -> str:
+ """
+ Clean JATS XML tags in the abstract and convert them to HTML tags.
+
+ Args:
+ text (str | None): The raw abstract text containing JATS tags.
+
+ Returns:
+ str: The cleaned abstract text.
+ """
+ if not text:
+ return ""
+
+        replacements = {
+            "<jats:italic>": "<i>",
+            "</jats:italic>": "</i>",
+            "<jats:bold>": "<b>",
+            "</jats:bold>": "</b>",
+            "<jats:underline>": "<u>",
+            "</jats:underline>": "</u>",
+            "<jats:monospace>": "<code>",
+            "</jats:monospace>": "</code>",
+            "<jats:sc>": "<small>",
+            "</jats:sc>": "</small>",
+            "<jats:sub>": "<sub>",
+            "</jats:sub>": "</sub>",
+            "<jats:sup>": "<sup>",
+            "</jats:sup>": "</sup>",
+            "<jats:title>": "<h2>",
+            "</jats:title>": "</h2>",
+            "<jats:p>": "<p>",
+            "</jats:p>": "</p>",
+            '<jats:list list-type="bullet">': "<ul>",
+            "</jats:list>": "</ul>",
+            "<jats:list-item>": "<li>",
+            "</jats:list-item>": "</li>",
+            "<jats:blockquote>": "<blockquote>",
+            "</jats:blockquote>": "</blockquote>",
+        }
+
+ for jats_tag, html_tag in replacements.items():
+ text = text.replace(jats_tag, html_tag)
+ return text
+
+class SubjectMapper:
+ """
+ Maps subject names from input data to controlled vocabulary.
+ """
+ CONTROLLED_VOCAB = {
+ "Agricultural Sciences": "Agricultural Sciences",
+ "Arts and Humanities": "Arts and Humanities",
+ "Astronomy": "Astronomy and Astrophysics",
+ "Astrophysics": "Astronomy and Astrophysics",
+ "Business": "Business and Management",
+ "Management": "Business and Management",
+ "Chemistry": "Chemistry",
+ "Computer Science": "Computer and Information Science",
+ "Information Science": "Computer and Information Science",
+ "Earth Sciences": "Earth and Environmental Sciences",
+ "Environmental Sciences": "Earth and Environmental Sciences",
+ "Engineering": "Engineering",
+ "Law": "Law",
+ "Mathematics": "Mathematical Sciences",
+ "Medicine": "Medicine, Health and Life Sciences",
+ "Health Sciences": "Medicine, Health and Life Sciences",
+ "Life Sciences": "Medicine, Health and Life Sciences",
+ "Physics": "Physics",
+ "Social Sciences": "Social Sciences",
+ }
+
+ @classmethod
+ def get_subjects(cls, data: dict[str, Any], fallback_subject: str = "Other") -> list[str]:
+ """
+ Extract and map subjects from input data.
+
+ Args:
+ data (dict[str, Any]): The input metadata.
+ fallback_subject (str): Fallback subject if none found.
+
+ Returns:
+ list[str]: List of mapped subject names.
+ """
+ topics = data.get("topics", [])
+ subject_collection: list[Any] = []
+
+ for topic in topics:
+ for field_type in ["subfield", "field", "domain"]:
+ if field_name := topic.get(field_type, {}).get("display_name"):
+ subject_collection.append(field_name)
+
+ mapped_subjects = cls.map_subjects(subject_collection)
+ return mapped_subjects if mapped_subjects else [fallback_subject]
+
+
+ @classmethod
+ def map_subjects(cls, subjects: list[str]) -> list[str]:
+ """
+ Map given subjects to valid controlled vocabulary terms.
+
+ Args:
+ subjects (list[str]): List of subjects.
+
+ Returns:
+ list[str]: List of valid subjects.
+ """
+ valid_subjects: set[str] = set()
+ for subject in subjects:
+ if mapped_subject := cls.CONTROLLED_VOCAB.get(subject):
+ valid_subjects.add(mapped_subject)
+ return list(valid_subjects)
+
+class CitationBuilder:
+ """
+ Builds various citation-related metadata fields.
+ """
+ def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None:
+ """
+ Initialize the CitationBuilder with data, DOI, and a PIFinder.
+
+ Args:
+ data (dict[str, Any]): Metadata from an external source.
+ doi (str): The DOI.
+ pi_finder (PIFinder): Instance to find PI information.
+ """
+ self.data = data
+ self.doi = doi
+ self.pi_finder = pi_finder
+
+ def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
+ """
+ Build metadata fields for other identifiers (e.g., DOI, PMID).
+
+ Returns:
+ list[list[PrimitiveMetadataField]]: Nested list of identifier metadata fields.
+ """
+ other_ids = [[
+ PrimitiveMetadataField("otherIdAgency", False, "doi"),
+ PrimitiveMetadataField("otherIdValue", False, self.doi)
+ ]]
+
+ if pmid := self.data.get("ids", {}).get("pmid"):
+ try:
+ normalized_pmid = normalize_pmid(pmid)
+ other_ids.append([
+ PrimitiveMetadataField("otherIdAgency", False, "pmid"),
+ PrimitiveMetadataField("otherIdValue", False, normalized_pmid)
+ ])
+ except ValueError:
+ pass
+
+ return other_ids
+
+ def build_grants(self) -> list[list[PrimitiveMetadataField]]:
+ """
+ Build metadata fields for grants.
+
+ Returns:
+ list[list[PrimitiveMetadataField]]: Nested list of grant metadata fields.
+ """
+ config = Config()
+ default_grants = config.DEFAULT_GRANTS
+
+ grants: list[list[PrimitiveMetadataField]] = []
+
+ for grant in default_grants:
+ grants.append([PrimitiveMetadataField("grantNumberAgency", False, grant["funder"]), PrimitiveMetadataField("grantNumberValue", False, grant["id"])])
+
+ for grant in self.data.get("grants", []):
+ grant_funder = grant.get("funder_display_name", {})
+ grant_id = grant.get("award_id", {})
+ if not grant_funder or not grant_id:
+ continue
+
+ grants.append([PrimitiveMetadataField("grantNumberAgency", False, grant_funder), PrimitiveMetadataField("grantNumberValue", False, grant_id)])
+
+ return grants
+
+
+ def build_authors(self) -> tuple[list[Person], list[Person]]:
+ """
+ Build lists of authors and corresponding authors from the metadata.
+
+ Returns:
+ tuple: (authors, corresponding_authors)
+ """
+ authors: list[Person] = []
+ corresponding_authors: list[Person] = []
+ for authorship in self.data.get("authorships", []):
+ author = authorship.get("author", {})
+ if not author:
+ continue
+
+ author_person = self._process_author(author, authorship)
+ authors.append(author_person)
+
+ if authorship.get("is_corresponding"):
+ corresponding_entry = self._process_corresponding_author(author_person, authorship)
+ if corresponding_entry:
+ corresponding_authors.append(corresponding_entry)
+
+ return authors, corresponding_authors
+
+
+ def _process_author(self, author: dict[str, Any], authorship: dict[str, Any]) -> Person:
+ """
+ Process author data and return a Person instance.
+
+ Args:
+ author (dict[str, Any]): Author data.
+ authorship (dict[str, Any]): Authorship metadata.
+
+ Returns:
+ Person: Processed author.
+ """
+ display_name = author.get("display_name", "")
+ given_name, family_name = NameProcessor.split_name(display_name)
+
+ person = Person(family_name, given_name)
+
+ if affiliations := authorship.get("affiliations"):
+ affiliation = affiliations[0].get("raw_affiliation_string", "").strip()
+ person.affiliation = affiliation
+
+ if orcid := author.get("orcid"):
+ person.orcid = normalize_orcid(orcid)
+
+ return person
+
+
+ def _process_corresponding_author(self, author: Person, authorship: dict[str, Any]) -> Person | None:
+ """
+ Identify the corresponding author based on provided PI information.
+
+ Args:
+ author (Person): The author.
+ authorship (dict[str, Any]): Authorship metadata.
+
+ Returns:
+ Person | None: The corresponding author, or None if not found.
+ """
+ pi = self.pi_finder.find_pi(
+ family_name=author.family_name,
+ given_name=author.given_name,
+ orcid=author.orcid
+ )
+
+ if not pi:
+ return None
+
+ return pi
+
+ def build_topics(self) -> list[list[PrimitiveMetadataField]]:
+ """
+ Build metadata fields for topics based on a threshold score.
+
+ Returns:
+ list[list[PrimitiveMetadataField]]: Nested list of topic metadata fields.
+ """
+ topics: list[list[PrimitiveMetadataField]] = []
+
+ for topic in self.data.get("topics", []):
+ if topic.get("score") >= 0.8:
+ if name := topic.get("display_name"):
+ topics.append([PrimitiveMetadataField("topicClassValue", False, name)])
+
+ return topics
+
+
+ def build_keywords(self) -> list[list[PrimitiveMetadataField]]:
+ """
+ Build metadata fields for keywords from both regular keywords and MeSH terms.
+
+ Returns:
+ list[list[PrimitiveMetadataField]]: Nested list of keyword metadata fields.
+ """
+ keywords: list[list[PrimitiveMetadataField]] = []
+
+ for keyword in self.data.get("keywords", []):
+ # Filter out possibly unrelated keywords (low score)
+ if keyword["score"] >= 0.5:
+ keyword_value_field = PrimitiveMetadataField("keywordValue", False, keyword["display_name"])
+ keywords.append([keyword_value_field])
+
+ mesh_base_url = "http://id.nlm.nih.gov/mesh"
+ for mesh in self.data.get("mesh", []):
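+            # Build a MeSH term URI, e.g. descriptor "D009369" (+ optional qualifier "Q000235")
+            # -> http://id.nlm.nih.gov/mesh/D009369Q000235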
+ url = f"{mesh_base_url}/{mesh['descriptor_ui']}"
+ if mesh["qualifier_ui"]:
+ url = f"{url}{mesh['qualifier_ui']}"
+
+
+ keyword_value_field = PrimitiveMetadataField("keywordValue", False, mesh["descriptor_name"])
+ keyword_term_uri_field = PrimitiveMetadataField("keywordTermURI", False, url)
+ keyword_vocabulary_field = PrimitiveMetadataField("keywordVocabulary", False, "MeSH")
+ keyword_vocabulary_uri_field = PrimitiveMetadataField("keywordVocabularyURI", False, mesh_base_url)
+
+ keywords.append([keyword_value_field, keyword_term_uri_field, keyword_vocabulary_field, keyword_vocabulary_uri_field])
+
+ return keywords
+
+class MetadataProcessor:
+ """
+ Processes metadata for a given DOI by fetching data from OpenAlex,
+ building metadata blocks, and optionally uploading the dataset.
+ """
+ def __init__(
+ self,
+ doi: str,
+ depositor: str | None = None,
+ output_path: Path | None = None,
+ default_subject: str = "Other",
+ contact_mail: str | None = None,
+ upload: bool = False,
+ console: Console | None = None,
+ progress: Progress | None = None,
+ task_id: TaskID | None = None
+ ) -> None:
+ """
+ Initialize the MetadataProcessor with configuration and processing options.
+
+ Args:
+ doi (str): The DOI to process.
+ depositor (str | None): Depositor name.
+ output_path (Path | None): Path where metadata will be saved.
+ default_subject (str): Default subject.
+ contact_mail (str | None): Contact email address.
+ upload (bool): Whether to upload metadata.
+ console (Console | None): Rich console instance.
+ progress (Progress | None): Progress bar instance.
+ task_id (TaskID | None): Task ID for progress updates.
+ """
+ self.console = console or Console()
+ try:
+ self.doi = self._validate_doi(doi)
+ except ValueError as e:
+ print(f"Error: {str(e)}")
+ raise
+ self.depositor = depositor
+ self.output_path = output_path
+ self.default_subject = default_subject
+ self.api_client = APIClient(contact_mail)
+ config = Config()
+ pi_objects = [Person(**pi) for pi in config.PIS]
+ self.pi_finder = PIFinder(pi_objects)
+ self.upload = upload
+ self.progress = progress
+ self.task_id = task_id
+
+ @staticmethod
+ def _validate_doi(doi: str) -> str:
+ """
+ Validate and normalize a DOI.
+
+ Args:
+ doi (str): The DOI to validate.
+
+ Returns:
+ str: Normalized DOI.
+
+ Raises:
+ ValueError: If the DOI is invalid.
+ """
+ if not is_doi(doi):
+ raise ValueError(f"Invalid DOI: {doi}")
+ return normalize_doi(doi)
+
+ def _update_progress(self) -> None:
+ """
+ Advance the progress bar if enabled.
+ """
+ if self.progress and self.task_id is not None:
+ self.progress.advance(self.task_id)
+
+ def process(self) -> dict[str, Any]:
+ """
+ Process the DOI: fetch data, build metadata, optionally upload, and save output.
+
+ Returns:
+ dict[str, Any]: The constructed metadata dictionary.
+ """
+ self.console.print(f"{ICONS['processing']} Processing DOI: {self.doi}", style="info")
+
+ data = self._fetch_data()
+ self._update_progress()
+
+ metadata = self._build_metadata(data)
+ self._update_progress()
+
+ if self.upload:
+ self._upload_data(metadata)
+ self._update_progress()
+
+ self._save_output(metadata)
+ self._update_progress()
+
+ self.console.print(f"\n{ICONS['success']} Successfully processed: {self.doi}\n", style="success")
+ return metadata
+
+ def _upload_data(self, metadata: dict[str, Any]) -> dict[str, Any]:
+ """
+ Upload the metadata to Dataverse.
+
+ Args:
+ metadata (dict[str, Any]): The metadata to upload.
+
+ Returns:
+ dict[str, Any]: The response from the Dataverse API.
+
+ Raises:
+ ValueError: If the upload fails.
+ """
+ config = Config()
+
+ token = config.DATAVERSE['api_token']
+ client = APIClient(token=token)
+ url = f"{config.DATAVERSE['url']}/api/dataverses/{config.DATAVERSE['dataverse']}/datasets?doNotValidate=true"
+ auth = (config.DATAVERSE['auth_user'], config.DATAVERSE['auth_password'])
+
+ response = client.make_request(url, method="POST", auth=auth, json=metadata)
+
+ if response is None or response.status_code != 201:
+ self.console.print(f"\n{ICONS['error']} Failed to upload to Dataverse: {url}", style="error")
+ raise ValueError(f"Failed to upload to Dataverse: {url}")
+ else:
+ perma = response.json().get("data", {}).get("persistentId", "")
+ self.console.print(f"{ICONS['upload']} Dataset uploaded to: {config.DATAVERSE['dataverse']} with ID {perma}", style="info")
+
+ return response.json()
+
+ def _fetch_data(self) -> dict[str, Any]:
+ """
+ Fetch metadata from OpenAlex for the given DOI.
+
+ Returns:
+ dict[str, Any]: The fetched data.
+
+ Raises:
+ ValueError: If data fetching fails.
+ """
+ url = f"https://api.openalex.org/works/https://doi.org/{self.doi}"
+ response = self.api_client.make_request(url)
+
+ if response is None or response.status_code != 200:
+ self.console.print(f"\n{ICONS['error']} Failed to fetch data for DOI: {self.doi}", style="error")
+ raise ValueError(f"Failed to fetch data for DOI: {self.doi}")
+
+ return response.json()
+
+ def _build_metadata(self, data: dict[str, Any]) -> dict[str, Any]:
+ """
+ Construct the complete metadata dictionary from fetched data.
+
+ Args:
+ data (dict[str, Any]): The data retrieved from OpenAlex.
+
+ Returns:
+ dict[str, Any]: The complete metadata dictionary.
+ """
+ license_info = LicenseProcessor.process_license(data)
+ abstract_processor = AbstractProcessor(self.api_client)
+ abstract = abstract_processor.get_abstract(self.doi, data, license_info)
+ citation_builder = CitationBuilder(data, self.doi, self.pi_finder)
+
+ authors, corresponding_authors = citation_builder.build_authors()
+ author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
+ corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
+ for author in authors:
+ author_fields.append(author.author_fields())
+
+ if not corresponding_authors:
+ self.console.print(f"{ICONS['warning']} No corresponding authors explicitly declared; PIs are used as a fallback!", style="warning")
+ pis = self._get_involved_pis(data)
+ corresponding_authors: list[Person]
+ for pi in pis:
+ corresponding_authors.append(pi)
+
+ for corresponding_author in corresponding_authors:
+ corresponding_author_fields.append(corresponding_author.dataset_contact_fields())
+
+ description = self._build_description(data, abstract)
+
+ grants = citation_builder.build_grants()
+
+ return_dict: dict[str, Any] = {
+ "datasetVersion": {
+ "metadataBlocks": {
+ "citation": {
+ "fields": [
+ PrimitiveMetadataField("title", False, data.get("title", "")).to_dict(),
+ PrimitiveMetadataField("distributionDate", False, data.get("publication_date", "")).to_dict(),
+ CompoundMetadataField("otherId", True, citation_builder.build_other_ids()).to_dict(),
+ CompoundMetadataField("dsDescription", True, [[PrimitiveMetadataField("dsDescriptionValue", False, description)]]).to_dict(),
+ ControlledVocabularyMetadataField("subject", True, SubjectMapper.get_subjects(data, self.default_subject)).to_dict(),
+ CompoundMetadataField("topicClassification", True, citation_builder.build_topics()).to_dict(),
+ CompoundMetadataField("keyword", True, citation_builder.build_keywords()).to_dict(),
+ PrimitiveMetadataField("depositor", False, self.depositor or data["primary_location"]["source"].get("display_name", "")).to_dict(),
+ PrimitiveMetadataField("alternativeURL", False, f"https://doi.org/{self.doi}").to_dict(),
+ CompoundMetadataField("author", True, author_fields).to_dict(),
+ CompoundMetadataField("datasetContact", True, corresponding_author_fields).to_dict(),
+ CompoundMetadataField("grantNumber", True, grants).to_dict()
+ ],
+ "displayName": "Citation Metadata"
+ },
+ "crc1430_org_v1": self._build_organization_metadata(data)
+ },
+ "files": []
+ }
+ }
+
+ if license_info.name:
+            return_dict["datasetVersion"]["license"] = {
+                "name": license_info.name,
+                "uri": license_info.uri
+            }
+ else:
+ return_dict["datasetVersion"]["termsOfUse"] = f"All rights reserved. Copyright © {self._get_publication_year(data)}, [TODO: Insert copyright holder here!]"
+
+ return return_dict
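+
+    # Abridged sketch of the dictionary returned above (derived from the fields built in this
+    # method; field contents are elided):
+    #   {
+    #     "datasetVersion": {
+    #       "metadataBlocks": {
+    #         "citation": {"fields": [...], "displayName": "Citation Metadata"},
+    #         "crc1430_org_v1": {"fields": [...]}
+    #       },
+    #       "files": [],
+    #       "license": {"name": ..., "uri": ...}   # only when a license was detected,
+    #                                              # otherwise "termsOfUse" is set instead
+    #     }
+    #   }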
+
+ def _build_description(self, data: dict[str, Any], abstract: Abstract) -> str:
+ """
+ Build the description field by combining a header and the abstract.
+
+ Args:
+ data (dict[str, Any]): The metadata.
+ abstract (Abstract): The abstract object.
+
+ Returns:
+ str: The full description.
+ """
+ head = self._build_description_head(data)
+ return f"{head}{abstract.text}"
+
+ def _build_description_head(self, data: dict[str, Any]) -> str:
+ """
+ Build the header for the description based on publication details.
+
+ Args:
+ data (dict[str, Any]): The metadata.
+
+ Returns:
+ str: The HTML header string.
+ """
+ journal = data.get("primary_location", {}).get("source", {}).get("display_name")
+ publication_date = data.get("publication_date")
+ volume = data.get("biblio", {}).get("volume")
+ issue = data.get("biblio", {}).get("issue")
+ type = data.get("type")
+
+ if all([journal, publication_date, volume, issue, type]):
+            return f"This {type} was published on {publication_date} in {journal} {volume}({issue})\n"
+        elif all([journal, publication_date, type]):
+            return f"This {type} was published on {publication_date} in {journal}\n"
+
+ self.console.print(f"{ICONS['warning']} No abstract header added, missing information (journal, publication date and/or document type)", style="warning")
+ return ""
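+
+    # Example of a generated header (placeholder values, not real metadata):
+    #   "This article was published on 2024-01-15 in Example Journal 12(3)\n"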
+
+ def _get_publication_year(self, data: dict[str, Any]) -> str:
+ """
+ Extract the publication year from the metadata.
+
+ Args:
+ data (dict[str, Any]): The metadata.
+
+ Returns:
+ str: The publication year.
+ """
+ return data.get("publication_year", "")
+
+ def _build_organization_metadata(self, data: dict[str, Any]) -> dict[str, Any]:
+ """
+ Build organization metadata fields (phase, project, PI names).
+
+ Args:
+ data (dict[str, Any]): The metadata.
+
+ Returns:
+ dict[str, Any]: Organization metadata.
+ """
+ publication_year = self._get_publication_year(data)
+ if publication_year:
+ phases = self._get_phases(int(publication_year))
+ else:
+ phases = []
+
+ pis = self._get_involved_pis(data)
+ projects: list[str] = []
+ for pi in pis:
+ for project in pi.project:
+ projects.append(project)
+
+ pi_names: list[str] = []
+ for pi in pis:
+ pi_names.append(pi.format_name())
+
+ # Deduplicate projects and PI names
+ unique_projects = list(set(projects))
+ unique_pi_names = list(set(pi_names))
+
+ return {
+ "fields": [
+ ControlledVocabularyMetadataField("crc1430OrgV1Phase", True, phases).to_dict(),
+ ControlledVocabularyMetadataField("crc1430OrgV1Project", True, unique_projects).to_dict(),
+ ControlledVocabularyMetadataField("crc1430OrgV1PI", True, unique_pi_names).to_dict()
+ ]
+ }
+
+ def _get_phases(self, year: int) -> list[str]:
+ """
+ Determine the project phases matching a given publication year.
+
+ Args:
+ year (int): The publication year.
+
+ Returns:
+ list[str]: List of matching phase names.
+ """
+ config = Config()
+ matching_phases: list[str] = []
+ for phase_name, phase_info in config.PHASE.items():
+ phase = Phase(phase_name, phase_info["start"], phase_info["end"])
+ if phase.check_year(year):
+ matching_phases.append(phase.name)
+ return matching_phases
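+
+    # Illustrative sketch of the PHASE configuration this method assumes (the key names
+    # "start"/"end" come from the code above; the phase names are placeholders, see
+    # config_example.yaml for the authoritative format):
+    #   phase:
+    #     Phase1: {start: 2018, end: 2021}
+    #     Phase2: {start: 2022, end: 2025}
+    # With such a config, _get_phases(2022) would return ["Phase2"].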
+
+ def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]:
+ """
+ Identify involved principal investigators from the metadata.
+
+ Args:
+ data (dict[str, Any]): The metadata.
+
+ Returns:
+ list[Person]: List of PIs.
+ """
+ involved_pis: list[Person] = []
+ for authorship in data.get("authorships", []):
+ author = authorship.get("author", {})
+ if not author:
+ continue
+
+ display_name = author.get("display_name", "")
+ given_name, family_name = NameProcessor.split_name(display_name)
+
+ if pi := self.pi_finder.find_pi(
+ family_name=family_name,
+ given_name=given_name,
+ orcid=author.get("orcid")
+ ):
+ involved_pis.append(pi)
+
+ return involved_pis
+
+ def _save_output(self, metadata: dict[str, Any]) -> None:
+ """
+ Save the generated metadata to a file or print it to the console.
+
+ Args:
+ metadata (dict[str, Any]): The metadata to save.
+ """
+ if self.output_path:
+ try:
+ with open(self.output_path, "w", encoding="utf-8") as f:
+ json.dump(metadata, f, indent=4, ensure_ascii=False)
+ self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
+ except Exception as e:
+ self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
+ raise
+ else:
+ self.console.print(metadata)
+
+def sanitize_filename(doi: str) -> str:
+ """
+ Convert DOI to a valid filename using only alphanumeric characters and underscores.
+
+ Args:
+ doi (str): The DOI to sanitize.
+
+ Returns:
+ str: Sanitized filename string.
+ """
+ # Replace non-alphanumeric characters with underscores
+ sanitized = ''.join(c if c.isalnum() else '_' for c in doi)
+ # Remove consecutive underscores
+ while '__' in sanitized:
+ sanitized = sanitized.replace('__', '_')
+ # Remove leading/trailing underscores
+ return sanitized.strip('_')
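+
+# Example (mirrors tests/test_doi2dataset.py):
+#   sanitize_filename("10.1234/abc.def")  ->  "10_1234_abc_def"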
+
+def print_summary(results: dict[str, list[Any]], console: Console) -> None:
+ """
+ Print a summary table of processing results to the console.
+
+ Args:
+ results (dict[str, list[Any]]): Dictionary containing success and failed DOIs.
+ console (Console): Rich console object for output.
+ """
+ table = Table(title="Processing Results")
+
+ table.add_column("Status", style="bold")
+ table.add_column("Count", justify="right")
+ table.add_column("DOIs", style="dim")
+
+ table.add_row(
+ f"{ICONS['success']} Success",
+ str(len(results["success"])),
+ ", ".join(results["success"][:3]) + ("..." if len(results["success"]) > 3 else "")
+ )
+
+ if results["failed"]:
+ table.add_row(
+ f"{ICONS['error']} Failed",
+ str(len(results["failed"])),
+ ", ".join(doi for doi, _ in results["failed"][:3]) +
+ ("..." if len(results["failed"]) > 3 else "")
+ )
+
+ console.print(Panel(table, title="Summary", border_style="blue"))
+
+def validate_email_address(email: str) -> bool:
+ """
+ Validate an email address and ensure its domain has an MX record.
+
+ Args:
+ email (str): The email address to validate.
+
+ Returns:
+ bool: True if the email address is valid and its domain resolves, otherwise False.
+ """
+ try:
+ # Basic validation
+ valid = validate_email(email)
+ email = valid.normalized
+
+ # Check domain has MX record
+ domain = email.split('@')[1]
+ dns.resolver.resolve(domain, 'MX')
+
+ return True
+ except (EmailNotValidError, dns.resolver.NoAnswer, dns.resolver.NXDOMAIN):
+ return False
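+
+# Examples (mirroring tests/test_doi2dataset.py): validate_email_address("john.doe@iana.org")
+# returns True, while "john.doe@invalid_domain" returns False. Note that the MX lookup
+# requires a working DNS resolver, so valid addresses may be rejected in offline environments.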
+
+def process_doi_batch(
+ dois: set[str],
+ output_dir: Path,
+ depositor: str | None = None,
+ default_subject: str = "Medicine, Health and Life Sciences",
+ contact_mail: str | None = None,
+ upload: bool = False
+) -> dict[str, list[Any]]:
+ """
+ Process a batch of DOIs and return a summary of results.
+
+ Args:
+ dois (set[str]): Set of DOIs to process.
+ output_dir (Path): Directory where metadata files will be saved.
+ depositor (str | None): Depositor name.
+ default_subject (str): Default subject for metadata.
+ contact_mail (str | None): Contact email address.
+ upload (bool): Flag indicating whether to upload metadata to Dataverse.
+
+ Returns:
+ dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
+ """
+ results: dict[str, list[Any]] = {"success": [], "failed": []}
+
+ progress_columns = [
+ SpinnerColumn(),
+ TextColumn("[bold blue]{task.description:<50}"),
+ BarColumn(bar_width=None),
+ TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+ TextColumn("•"), # Separator
+ TimeElapsedColumn(),
+ TextColumn("•"), # Separator
+ TextColumn("[bold]{task.completed}/{task.total}"),
+ ]
+
+ # Define steps for each DOI processing
+ if upload:
+ doi_total_steps = 4 # Fetch, Build, Upload, Save
+ else:
+ doi_total_steps = 3 # Fetch, Build, Save
+
+ with Progress(
+ *progress_columns,
+ console=console,
+ transient=True # This makes the progress bar disappear after completion
+ ) as progress:
+ # Add main task
+ main_task = progress.add_task(
+ "[bold blue]Processing DOIs...",
+ total=len(dois)
+ )
+
+        # Add status task for the DOI currently being processed
+        status_task = progress.add_task(
+            "[cyan]Current:",
+            total=doi_total_steps,
+            visible=False  # Hidden until the first DOI is picked up
+        )
+
+ for doi in dois:
+ try:
+ # Update status display
+ progress.update(
+ status_task,
+ description=f"[cyan]Current: [white]{doi[:50]}...",
+ visible=True,
+ completed=0 # Reset progress for new DOI
+ )
+
+ # Process the DOI
+ sanitized_filename = sanitize_filename(normalize_doi(doi))
+ output_path = output_dir / f"{sanitized_filename}_metadata.json"
+
+ processor = MetadataProcessor(
+ doi=doi,
+ depositor=depositor,
+ output_path=output_path,
+ default_subject=default_subject,
+ contact_mail=contact_mail,
+ upload=upload,
+ console=console,
+ progress=progress,
+ task_id=status_task
+ )
+
+ # Process and capture result
+ processor.process()
+ results["success"].append(doi)
+
+ # Update progress
+ progress.advance(main_task)
+
+ except Exception as e:
+ # Handle errors
+ results["failed"].append((doi, str(e)))
+
+ # Show error but keep progress bar
+ progress.console.print(
+ f"{ICONS['error']} Error processing {doi}: {str(e)}",
+ style="error"
+ )
+ finally:
+ # Clear current status
+ progress.update(status_task, visible=False)
+
+ # Print final summary
+ print_summary(results, console)
+
+ return results
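+
+# Minimal usage sketch (illustrative; DOI and output directory are placeholders):
+#   results = process_doi_batch({"10.1038/srep45389"}, Path("output"))
+#   # -> {"success": ["10.1038/srep45389"], "failed": []} if processing succeeds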
+
+
+if __name__ == "__main__":
+
+ console = Console(theme=THEME)
+
+ try:
+
+ parser = argparse.ArgumentParser(description="Process DOIs to generate metadata")
+ parser.add_argument(
+ "dois",
+ nargs="*",
+ help="One or more DOIs to process"
+ )
+ parser.add_argument(
+ "-f", "--file",
+ help="File containing DOIs (one per line)",
+ type=argparse.FileType('r')
+ )
+ parser.add_argument(
+ "-o", "--output-dir",
+ help="Output directory for metadata files",
+ default="."
+ )
+ parser.add_argument(
+ "-d", "--depositor",
+ help="Name of the depositor",
+ default=None
+ )
+ parser.add_argument(
+ "-s", "--subject",
+ help="Default subject",
+ default="Medicine, Health and Life Sciences"
+ )
+ parser.add_argument(
+ "-m", "--contact-mail",
+ help="Contact email address",
+        default=None
+ )
+ parser.add_argument(
+ "-u", "--upload",
+ help="Upload to Dataverse",
+ action='store_true'
+ )
+
+ args = parser.parse_args()
+
+ # Ensure we have either DOIs as arguments or a file
+ if not args.dois and not args.file:
+ console.print(f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.", style="error")
+ parser.print_help()
+ sys.exit(1)
+
+ # Get DOIs from both direct arguments and file if provided
+ dois = set(args.dois) # Start with directly provided DOIs
+ if args.file:
+ console.print(f"{ICONS['file']} Reading DOIs from file: {args.file.name}", style="info")
+ dois.update(line.strip() for line in args.file if line.strip())
+
+ # Create output directory if it doesn't exist
+ output_dir = Path(args.output_dir)
+ try:
+ output_dir.mkdir(parents=True, exist_ok=True)
+ console.print(f"{ICONS['folder']} Output directory: {output_dir}\n", style="info")
+ except Exception as e:
+ console.print(f"Failed to create output directory: {str(e)}\n", style="error")
+ sys.exit(1)
+
+ if args.contact_mail:
+ if not validate_email_address(args.contact_mail):
+ raise ValueError(f"Not a valid email address: {args.contact_mail}")
+ console.print(f"{ICONS['info']} Exposing contact email <{args.contact_mail}> to API services.\n", style="info")
+
+        # Process the DOIs; a summary table is printed by process_doi_batch()
+        process_doi_batch(
+            dois=dois,
+            output_dir=output_dir,
+            depositor=args.depositor,
+            default_subject=args.subject,
+            contact_mail=args.contact_mail,
+            upload=args.upload
+        )
+
+ except KeyboardInterrupt:
+ console.print(f"\n{ICONS['warning']} Processing interrupted by user", style="warning")
+ sys.exit(1)
+ except Exception as e:
+ console.print(f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error")
+ sys.exit(1)
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..9a2e475
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,3 @@
+pytest>=8.3.5,<9.0
+pytest-mock>=3.14.0,<4.0
+ruff>=0.11.1,<0.20
diff --git a/requirements-doc.txt b/requirements-doc.txt
new file mode 100644
index 0000000..045ded0
--- /dev/null
+++ b/requirements-doc.txt
@@ -0,0 +1,2 @@
+sphinx>=8.2.3,<9.0.0
+sphinx_rtd_theme>=3.0,<4.0
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3e808cf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+dnspython>=2.7.0,<3.0.0
+requests>=2.32.3,<2.33.0
+PyYAML>=6.0.2,<7.0
+email_validator>=2.2.0,<3.0.0
+rich>=13.9.4,<14.0.0
+idutils>=1.4.2,<2.0.0
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..be93bdd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Read the long description from README.md if it is present
+readme = Path("README.md")
+long_description = readme.read_text(encoding="utf-8") if readme.is_file() else ""
+
+setup(
+    name="doi2dataset",
+    version="1.0",
+    description="A tool to process DOIs and generate metadata for Dataverse.org datasets.",
+    long_description=long_description,
+ long_description_content_type="text/markdown",
+ author="Alexander Minges",
+ author_email="alexander.minges@uni-due.de",
+ url="https://github.com/your_username/doi2dataset",
+ packages=find_packages(),
+ install_requires=[
+ "dnspython>=2.7.0,<3.0.0",
+ "requests>=2.32.3,<2.33.0",
+ "PyYAML>=6.0,<7.0",
+ "email_validator>=2.2.0,<3.0.0",
+ "rich>=13.9.4,<14.0.0",
+ "idutils>=1.4.2,<2.0.0"
+ ],
+ extras_require={
+ "docs": [
+ "sphinx>=8.2.3,<9.0.0",
+ "sphinx_rtd_theme>=3.0,<4.0"
+ ],
+ "dev": [
+ "pytest>=8.3.5,<9.0",
+        "pytest-mock>=3.14.0,<4.0",
+ "ruff>=0.11.1,<0.20"
+ ]
+ },
+ entry_points={
+ "console_scripts": [
+ "doi2dataset=doi2dataset:main"
+ ]
+ },
+ classifiers=[
+ "Programming Language :: Python :: 3",
+ "Operating System :: OS Independent",
+ "License :: OSI Approved :: MIT License",
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "Topic :: Software Development :: Build Tools",
+ ],
+ python_requires='>=3.10',
+)
diff --git a/tests/test_doi2dataset.py b/tests/test_doi2dataset.py
new file mode 100644
index 0000000..65ceecb
--- /dev/null
+++ b/tests/test_doi2dataset.py
@@ -0,0 +1,50 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+from doi2dataset import NameProcessor, Phase, sanitize_filename, validate_email_address
+
+
+def test_phase_check_year():
+ """Test that check_year correctly determines if a year is within the phase boundaries."""
+ phase = Phase("TestPhase", 2000, 2010)
+ # Within boundaries
+ assert phase.check_year(2005) is True
+ # Outside boundaries
+ assert phase.check_year(1999) is False
+ assert phase.check_year(2011) is False
+ # Boundary cases
+ assert phase.check_year(2000) is True
+ assert phase.check_year(2010) is True
+
+def test_sanitize_filename():
+ """Test the sanitize_filename function to convert DOI to a valid filename."""
+ doi = "10.1234/abc.def"
+ expected = "10_1234_abc_def"
+ result = sanitize_filename(doi)
+ assert result == expected
+
+def test_split_name_with_comma():
+ """Test splitting a full name that contains a comma."""
+ full_name = "Doe, John"
+ given, family = NameProcessor.split_name(full_name)
+ assert given == "John"
+ assert family == "Doe"
+
+def test_split_name_without_comma():
+ """Test splitting a full name that does not contain a comma."""
+ full_name = "John Doe"
+ given, family = NameProcessor.split_name(full_name)
+ assert given == "John"
+ assert family == "Doe"
+
+def test_validate_email_address_valid():
+ """Test that a valid email address is correctly recognized."""
+ valid_email = "john.doe@iana.org"
+ assert validate_email_address(valid_email) is True
+
+def test_validate_email_address_invalid():
+ """Test that an invalid email address is correctly rejected."""
+ invalid_email = "john.doe@invalid_domain"
+ assert validate_email_address(invalid_email) is False
diff --git a/tests/test_fetch_doi_mock.py b/tests/test_fetch_doi_mock.py
new file mode 100644
index 0000000..a5de463
--- /dev/null
+++ b/tests/test_fetch_doi_mock.py
@@ -0,0 +1,57 @@
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import pytest
+
+from doi2dataset import MetadataProcessor
+
+
+class FakeResponse:
+ """
+ A fake response object to simulate an API response.
+ """
+ def __init__(self, json_data, status_code=200):
+ self._json = json_data
+ self.status_code = status_code
+
+ def json(self):
+ return self._json
+
+ def raise_for_status(self):
+ pass
+
+@pytest.fixture
+def fake_openalex_response():
+ """
+ Load the saved JSON response from the file 'srep45389.json'
+ located in the same directory as this test file.
+ """
+ json_path = os.path.join(os.path.dirname(__file__), "srep45389.json")
+ with open(json_path, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ return data
+
+def test_fetch_doi_data_with_file(mocker, fake_openalex_response):
+ """
+ Test fetching DOI metadata by simulating the API call with a locally saved JSON response.
+
+ The APIClient.make_request method is patched to return a fake response based on
+ the contents of 'srep45389.json', so that no actual network request is performed.
+ """
+ doi = "10.1038/srep45389"
+ fake_response = FakeResponse(fake_openalex_response, 200)
+
+ # Patch the make_request method of APIClient to return our fake_response.
+ mocker.patch("doi2dataset.APIClient.make_request", return_value=fake_response)
+
+ # Instantiate MetadataProcessor without upload and progress.
+ processor = MetadataProcessor(doi=doi, upload=False)
+
+ # Call _fetch_data(), which should now return our fake JSON data.
+ data = processor._fetch_data()
+
+ # Verify that the fetched data matches the fake JSON data.
+ assert data == fake_openalex_response