From 6c9ba4ff1e4a6e0ff28a239759f6ef607e0a686a Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Thu, 17 Jul 2025 11:22:52 +0200 Subject: [PATCH] feat: add environment variable support Add support for DATAVERSE_URL, DATAVERSE_API_TOKEN, DATAVERSE_DATAVERSE, DATAVERSE_AUTH_USER, and DATAVERSE_AUTH_PASSWORD environment variables. Environment variables override config file values when present. Includes documentation and test coverage. --- CHANGELOG.md | 12 ++ README.md | 34 ++++ config_example.yaml | 7 + docs/source/environment-variables.rst | 229 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/usage.rst | 34 ++++ doi2dataset.py | 49 +++++- tests/test_doi2dataset.py | 123 +++++++++++++- 8 files changed, 482 insertions(+), 7 deletions(-) create mode 100644 docs/source/environment-variables.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index c64620f..dbeafd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Environment variable support for Dataverse configuration +- Support for overriding sensitive credentials using environment variables: + - `DATAVERSE_URL` - Dataverse server URL + - `DATAVERSE_API_TOKEN` - API token for authentication + - `DATAVERSE_DATAVERSE` - Dataverse alias/name + - `DATAVERSE_AUTH_USER` - Basic authentication username + - `DATAVERSE_AUTH_PASSWORD` - Basic authentication password +- Environment variables take precedence over configuration file values +- Backward compatibility maintained - config file values used when environment variables are not set + ## [v2.0.3] - 2025-07-14 ### Added diff --git a/README.md b/README.md index de5dedf..089fc4e 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,40 @@ See `config_example.yaml` for a complete example configuration. **Note**: The PI section is optional. 
If no corresponding authors are found in the publication metadata and no PIs are configured, the tool will still generate metadata but may issue a warning about missing corresponding author information. +### Environment Variables + +For security and deployment flexibility, you can override Dataverse configuration values using environment variables. This is particularly useful for sensitive credentials like API tokens and passwords. + +The following environment variables are supported: + +- `DATAVERSE_URL` - Dataverse server URL +- `DATAVERSE_API_TOKEN` - API token for authentication +- `DATAVERSE_DATAVERSE` - Dataverse alias/name +- `DATAVERSE_AUTH_USER` - Basic authentication username +- `DATAVERSE_AUTH_PASSWORD` - Basic authentication password + +Environment variables take precedence over values in the configuration file. You can set some or all of these variables - any unset variables will fall back to the config file values. + +#### Example Usage + +```bash +# Set environment variables +export DATAVERSE_API_TOKEN="your-secure-token" +export DATAVERSE_AUTH_PASSWORD="your-secure-password" + +# Run doi2dataset - it will use environment variables for credentials +python doi2dataset.py 10.1234/example.doi + +# Or set them inline for a single run +DATAVERSE_API_TOKEN="token" python doi2dataset.py 10.1234/example.doi +``` + +This approach allows you to: + +- Keep sensitive credentials out of version control +- Use different configurations for different environments (dev, staging, production) +- Deploy the tool with secure environment-based configuration + ## Usage Run doi2dataset from the command line by providing one or more DOIs: diff --git a/config_example.yaml b/config_example.yaml index d00d523..dd22f69 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -1,3 +1,10 @@ +# Dataverse configuration +# Note: These values can be overridden by environment variables: +# - DATAVERSE_URL +# - DATAVERSE_API_TOKEN +# - DATAVERSE_DATAVERSE +# - DATAVERSE_AUTH_USER 
+# - DATAVERSE_AUTH_PASSWORD dataverse: url: "https://your-dataverse-instance.org" api_token: "your-api-token-here" diff --git a/docs/source/environment-variables.rst b/docs/source/environment-variables.rst new file mode 100644 index 0000000..d3e22f4 --- /dev/null +++ b/docs/source/environment-variables.rst @@ -0,0 +1,229 @@ +Environment Variables +===================== + +Overview +-------- + +**doi2dataset** supports environment variable configuration to override values from the ``config.yaml`` file. This feature is particularly valuable for: + +- **Security**: Keep sensitive credentials out of version control +- **Deployment**: Use different configurations across environments (development, staging, production) +- **CI/CD**: Securely inject credentials during automated deployments + +Supported Environment Variables +------------------------------- + +The following environment variables can be used to override Dataverse configuration: + +.. list-table:: + :header-rows: 1 + :widths: 25 25 50 + + * - Environment Variable + - Config File Key + - Description + * - ``DATAVERSE_URL`` + - ``dataverse.url`` + - Dataverse server URL + * - ``DATAVERSE_API_TOKEN`` + - ``dataverse.api_token`` + - API token for authentication + * - ``DATAVERSE_DATAVERSE`` + - ``dataverse.dataverse`` + - Dataverse alias/name + * - ``DATAVERSE_AUTH_USER`` + - ``dataverse.auth_user`` + - Basic auth username + * - ``DATAVERSE_AUTH_PASSWORD`` + - ``dataverse.auth_password`` + - Basic auth password + +Precedence Rules +---------------- + +Environment variables take precedence over configuration file values: + +1. **Environment Variable Set**: Uses the environment variable value +2. **Environment Variable Not Set**: Falls back to config file value +3. **Neither Set**: The key is left out of the loaded Dataverse configuration (may cause errors) + +This allows for flexible partial overrides - you can set only the sensitive credentials as environment variables while keeping other configuration in the file.
+ +Usage Examples +-------------- + +Basic Usage +~~~~~~~~~~~ + +.. code-block:: bash + + # Set environment variables + export DATAVERSE_API_TOKEN="your-secure-token" + export DATAVERSE_AUTH_PASSWORD="your-secure-password" + + # Run doi2dataset + python doi2dataset.py 10.1234/example.doi + +Inline Usage +~~~~~~~~~~~~ + +.. code-block:: bash + + # Set variables for a single command + DATAVERSE_API_TOKEN="token" DATAVERSE_URL="https://test.dataverse.org" python doi2dataset.py 10.1234/example.doi + +Shell Script +~~~~~~~~~~~~ + +Create a script to set multiple variables: + +.. code-block:: bash + + #!/bin/bash + # set_dataverse_env.sh + + export DATAVERSE_URL="https://your-dataverse-instance.org" + export DATAVERSE_API_TOKEN="your-api-token" + export DATAVERSE_DATAVERSE="your-dataverse-alias" + export DATAVERSE_AUTH_USER="your-username" + export DATAVERSE_AUTH_PASSWORD="your-password" + + echo "Environment variables set successfully!" + +Usage: + +.. code-block:: bash + + # Source the script to set variables in current shell + source set_dataverse_env.sh + + # Run doi2dataset + python doi2dataset.py 10.1234/example.doi + +Environment Files +~~~~~~~~~~~~~~~~~ + +For development and deployment, use environment files: + +.. code-block:: bash + + # .env file + DATAVERSE_API_TOKEN=your-secure-token + DATAVERSE_AUTH_PASSWORD=your-secure-password + +.. code-block:: bash + + # Load environment file + set -a + source .env + set +a + + # Run application + python doi2dataset.py 10.1234/example.doi + +Security Best Practices +------------------------ + +Use Secrets Management +~~~~~~~~~~~~~~~~~~~~~~ + +Never hardcode sensitive values in scripts or configuration files: + +.. 
code-block:: bash + + # ❌ Bad - hardcoded secrets + export DATAVERSE_API_TOKEN="abc123-def456-ghi789" + + # ✅ Good - read from secure source + export DATAVERSE_API_TOKEN=$(vault kv get -field=api_token secret/dataverse) + +Limit Environment Variable Scope +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Set environment variables only where needed: + +.. code-block:: bash + + # ❌ Bad - exports the variable to the whole shell session + export DATAVERSE_API_TOKEN="token" + + # ✅ Good - sets the variable for a single command + DATAVERSE_API_TOKEN="token" python doi2dataset.py 10.1234/example.doi + +Use Environment Files +~~~~~~~~~~~~~~~~~~~~~ + +For development, use environment files that are excluded from version control: + +.. code-block:: bash + + # .env (add to .gitignore) + DATAVERSE_API_TOKEN=dev-token + DATAVERSE_AUTH_PASSWORD=dev-password + +Troubleshooting +--------------- + +Checking Current Environment Variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # Check if variables are set + echo $DATAVERSE_API_TOKEN + echo $DATAVERSE_URL + + # List all DATAVERSE_* variables + env | grep DATAVERSE + +Common Issues +~~~~~~~~~~~~~ + +1. **Variables not taking effect**: Make sure variables are exported in the same shell where you run doi2dataset +2. **Partial override not working**: Check that every variable you leave unset has a value in config.yaml +3. **Permission errors**: Ensure the API token has the correct permissions for your Dataverse instance + +Migration Guide +--------------- + +If you're migrating from a config-file-only setup: + +1. **Identify sensitive values** in your ``config.yaml`` +2. **Set environment variables** for these values +3. **Test the configuration** to ensure it works correctly +4. **Remove sensitive values** from config.yaml (optional) +5. **Update deployment scripts** to set environment variables + +Example Migration +~~~~~~~~~~~~~~~~~ + +Before: + +..
code-block:: yaml + + # config.yaml + dataverse: + url: "https://dataverse.example.org" + api_token: "sensitive-token" + auth_password: "sensitive-password" + dataverse: "my-dataverse" + auth_user: "admin" + +After: + +.. code-block:: yaml + + # config.yaml + dataverse: + url: "https://dataverse.example.org" + dataverse: "my-dataverse" + auth_user: "admin" + # Sensitive values moved to environment variables + +.. code-block:: bash + + # Environment variables + export DATAVERSE_API_TOKEN="sensitive-token" + export DATAVERSE_AUTH_PASSWORD="sensitive-password" + +This approach keeps non-sensitive configuration in the file while securing credentials through environment variables. diff --git a/docs/source/index.rst b/docs/source/index.rst index 608c1d4..0e1fcbc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -37,6 +37,7 @@ Key Features: introduction installation usage + environment-variables modules contributing commit-messages diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 11d3a08..60b1029 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -64,6 +64,40 @@ Make sure that your **config.yaml** is properly configured before running the to - funder: "Another Funding Agency" id: "GrantID98765" +Environment Variables +--------------------- +For security and deployment flexibility, you can override Dataverse configuration values using environment variables. This is particularly useful for sensitive credentials like API tokens and passwords. + +The following environment variables are supported: + +- ``DATAVERSE_URL`` - Dataverse server URL +- ``DATAVERSE_API_TOKEN`` - API token for authentication +- ``DATAVERSE_DATAVERSE`` - Dataverse alias/name +- ``DATAVERSE_AUTH_USER`` - Basic authentication username +- ``DATAVERSE_AUTH_PASSWORD`` - Basic authentication password + +Environment variables take precedence over values in the configuration file. 
You can set some or all of these variables - any unset variables will fall back to the config file values. + +Example usage: + +.. code-block:: bash + + # Set environment variables + export DATAVERSE_API_TOKEN="your-secure-token" + export DATAVERSE_AUTH_PASSWORD="your-secure-password" + + # Run doi2dataset - it will use environment variables for credentials + python doi2dataset.py 10.1234/example.doi + + # Or set them inline for a single run + DATAVERSE_API_TOKEN="token" python doi2dataset.py 10.1234/example.doi + +This approach allows you to: + +- Keep sensitive credentials out of version control +- Use different configurations for different environments (dev, staging, production) +- Deploy the tool with secure environment-based configuration + Usage Example with Configuration ---------------------------------- If you have configured your **config.yaml** and want to process DOIs from a file while uploading the metadata, you could run: diff --git a/doi2dataset.py b/doi2dataset.py index 12a3373..aeaaeaf 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -21,6 +21,7 @@ Options: import argparse import json +import os import sys import unicodedata import warnings # TODO: Remove once the warning is stripped from idutils @@ -476,10 +477,15 @@ class Abstract: @dataclass class ConfigData: """ - Represents configuration data loaded from a YAML file. + Represents configuration data loaded from a YAML file with environment variable overrides. + + The dataverse configuration may be overridden by environment variables: + DATAVERSE_URL, DATAVERSE_API_TOKEN, DATAVERSE_DATAVERSE, + DATAVERSE_AUTH_USER, DATAVERSE_AUTH_PASSWORD. Attributes: - dataverse (dict[str, str]): Dataverse-related configuration. + dataverse (dict[str, str]): Dataverse-related configuration with environment + variable overrides applied. pis (list[dict[str, Any]]): List of principal investigator configurations. default_grants (list[dict[str, str]]): Default grant configurations. 
""" @@ -492,6 +498,15 @@ class ConfigData: class Config: """ Singleton class to handle configuration loading and retrieval. + + Supports environment variable overrides for Dataverse configuration: + - DATAVERSE_URL: Overrides dataverse.url + - DATAVERSE_API_TOKEN: Overrides dataverse.api_token + - DATAVERSE_DATAVERSE: Overrides dataverse.dataverse + - DATAVERSE_AUTH_USER: Overrides dataverse.auth_user + - DATAVERSE_AUTH_PASSWORD: Overrides dataverse.auth_password + + Environment variables take precedence over config file values. """ _instance: "Config | None" = None @@ -511,7 +526,11 @@ class Config: @classmethod def load_config(cls, config_path: str | Path | None = None) -> None: """ - Load configuration from a YAML file. + Load configuration from a YAML file with environment variable overrides. + + Environment variables will override corresponding config file values: + DATAVERSE_URL, DATAVERSE_API_TOKEN, DATAVERSE_DATAVERSE, + DATAVERSE_AUTH_USER, DATAVERSE_AUTH_PASSWORD Args: config_path (str | Path | None): Path to the configuration file. 
@@ -531,6 +550,23 @@ class Config: with open(config_path, encoding="utf-8") as f: config_data = yaml.safe_load(f) + # Override dataverse config with environment variables if they exist + dataverse_config = config_data.get("dataverse", {}) + + # Check for environment variables and override config values + env_overrides = { + "url": os.getenv("DATAVERSE_URL"), + "api_token": os.getenv("DATAVERSE_API_TOKEN"), + "dataverse": os.getenv("DATAVERSE_DATAVERSE"), + "auth_user": os.getenv("DATAVERSE_AUTH_USER"), + "auth_password": os.getenv("DATAVERSE_AUTH_PASSWORD"), + } + + # Apply environment variable overrides if they exist + for key, env_value in env_overrides.items(): + if env_value is not None: + dataverse_config[key] = env_value + # Validate PI email addresses pis = config_data.get("pis", []) for pi in pis: @@ -541,7 +577,7 @@ class Config: ) cls._config_data = ConfigData( - dataverse=config_data.get("dataverse", {}), + dataverse=dataverse_config, pis=config_data.get("pis", []), default_grants=config_data.get("default_grants", []), ) @@ -586,10 +622,11 @@ class Config: @property def DATAVERSE(self) -> dict[str, str]: """ - Get Dataverse configurations. + Get Dataverse configurations with environment variable overrides applied. Returns: - dict[str, str]: Dataverse configuration. + dict[str, str]: Dataverse configuration with environment variables + taking precedence over config file values. 
""" return self.get_config().dataverse diff --git a/tests/test_doi2dataset.py b/tests/test_doi2dataset.py index 4f4ec15..6fa279d 100644 --- a/tests/test_doi2dataset.py +++ b/tests/test_doi2dataset.py @@ -1,9 +1,12 @@ import os import sys +import tempfile + +import yaml sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from doi2dataset import NameProcessor, sanitize_filename, validate_email_address +from doi2dataset import Config, NameProcessor, sanitize_filename, validate_email_address def test_sanitize_filename(): @@ -40,3 +43,121 @@ def test_validate_email_address_invalid(): """Test that an invalid email address is correctly rejected.""" invalid_email = "john.doe@invalid_domain" assert validate_email_address(invalid_email) is False + + +def test_config_environment_variable_override(): + """Test that environment variables override config file values.""" + # Create a temporary config file with base values + config_data = { + "dataverse": { + "url": "https://config-file-url.org", + "api_token": "config-file-token", + "dataverse": "config-file-dataverse", + "auth_user": "config-file-user", + "auth_password": "config-file-password", + }, + "pis": [], + "default_grants": [], + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + temp_config_path = f.name + + try: + # Set environment variables + os.environ["DATAVERSE_URL"] = "https://env-url.org" + os.environ["DATAVERSE_API_TOKEN"] = "env-token" + os.environ["DATAVERSE_DATAVERSE"] = "env-dataverse" + os.environ["DATAVERSE_AUTH_USER"] = "env-user" + os.environ["DATAVERSE_AUTH_PASSWORD"] = "env-password" + + # Reset the Config singleton to ensure fresh load + Config._instance = None + Config._config_data = None + + # Load config with environment variables + Config.load_config(temp_config_path) + config = Config() + + # Verify environment variables override config file values + assert config.DATAVERSE["url"] == 
"https://env-url.org" + assert config.DATAVERSE["api_token"] == "env-token" + assert config.DATAVERSE["dataverse"] == "env-dataverse" + assert config.DATAVERSE["auth_user"] == "env-user" + assert config.DATAVERSE["auth_password"] == "env-password" + + finally: + # Clean up environment variables + for env_var in [ + "DATAVERSE_URL", + "DATAVERSE_API_TOKEN", + "DATAVERSE_DATAVERSE", + "DATAVERSE_AUTH_USER", + "DATAVERSE_AUTH_PASSWORD", + ]: + if env_var in os.environ: + del os.environ[env_var] + + # Clean up temp file + os.unlink(temp_config_path) + + # Reset Config singleton + Config._instance = None + Config._config_data = None + + +def test_config_partial_environment_variable_override(): + """Test that only some environment variables can be set, others fall back to config file.""" + # Create a temporary config file with base values + config_data = { + "dataverse": { + "url": "https://config-file-url.org", + "api_token": "config-file-token", + "dataverse": "config-file-dataverse", + "auth_user": "config-file-user", + "auth_password": "config-file-password", + }, + "pis": [], + "default_grants": [], + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + temp_config_path = f.name + + try: + # Set only some environment variables + os.environ["DATAVERSE_URL"] = "https://env-url.org" + os.environ["DATAVERSE_API_TOKEN"] = "env-token" + # Don't set DATAVERSE_DATAVERSE, DATAVERSE_AUTH_USER, DATAVERSE_AUTH_PASSWORD + + # Reset the Config singleton to ensure fresh load + Config._instance = None + Config._config_data = None + + # Load config with partial environment variables + Config.load_config(temp_config_path) + config = Config() + + # Verify environment variables override where set + assert config.DATAVERSE["url"] == "https://env-url.org" + assert config.DATAVERSE["api_token"] == "env-token" + + # Verify config file values are used where env vars are not set + assert config.DATAVERSE["dataverse"] == 
"config-file-dataverse" + assert config.DATAVERSE["auth_user"] == "config-file-user" + assert config.DATAVERSE["auth_password"] == "config-file-password" + + finally: + # Clean up environment variables + for env_var in ["DATAVERSE_URL", "DATAVERSE_API_TOKEN"]: + if env_var in os.environ: + del os.environ[env_var] + + # Clean up temp file + os.unlink(temp_config_path) + + # Reset Config singleton + Config._instance = None + Config._config_data = None