feat: complete CLI module separation

- Extract CLI functionality from monolithic script to dedicated modules
- Create doi2dataset/cli.py with all command-line interface logic
- Create doi2dataset/main.py as clean entry point
- Add normalize_doi function to utils.validation module
- Update pyproject.toml entry point to use new CLI module
- Maintain full backward compatibility with original doi2dataset.py

CLI modules created:
- cli.py: CLI functionality, argument parsing, progress tracking
- main.py: Clean entry point module

Features maintained:
- All command-line arguments and options work identically
- Progress tracking and rich console output preserved
- Error handling and validation maintained
- DOI batch processing functionality intact
- All 38 tests passing with 61% coverage

Benefits achieved:
- Clean separation between CLI and core business logic
- Multiple entry points available (cli.py, main.py, original script)
- Foundation for future CLI enhancements and testing
- Professional package structure with proper entry points
This commit is contained in:
Alexander Minges 2025-07-22 11:08:35 +02:00
parent b6209691c3
commit 091311038d
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
6 changed files with 360 additions and 1 deletions

View file

@ -35,6 +35,7 @@ from .api import (
APIClient,
LicenseProcessor,
)
from .cli import main, print_summary, process_doi_batch
from .core import (
Abstract,
BaseMetadataField,
@ -89,6 +90,10 @@ __all__ = [
"NameProcessor",
"PIFinder",
"SubjectMapper",
# CLI components
"main",
"process_doi_batch",
"print_summary",
# Utilities
"validate_doi",
"validate_email_address",

326
doi2dataset/cli.py Normal file
View file

@ -0,0 +1,326 @@
"""
Command-line interface for doi2dataset.
This module provides the main CLI functionality for processing DOIs and generating
metadata for Dataverse datasets. It handles argument parsing, progress tracking,
and batch processing of multiple DOIs.
"""
import argparse
import sys
from pathlib import Path
from typing import Any
from rich.console import Console
from rich.panel import Panel
from rich.progress import (
BarColumn,
Progress,
SpinnerColumn,
TextColumn,
TimeElapsedColumn,
)
from rich.table import Table
from rich.theme import Theme
from .processing.metadata import MetadataProcessor
from .utils.validation import normalize_doi, sanitize_filename, validate_email_address
# Console icons for user-friendly output.
# Every message in this module is formatted as f"{ICONS[...]} text", so each
# entry must be a non-empty, single-cell glyph; an empty string would leave a
# stray leading space and no visual marker.
ICONS = {
    "success": "✓",  # Simple checkmark
    "error": "✗",  # Simple X
    "warning": "!",  # Simple exclamation
    "info": "ℹ",  # Info symbol
    "processing": "⋯",  # Three dots
    "done": "■",  # Filled square
    "file": "⨳",  # Document symbol
    "folder": "⊞",  # Folder symbol
    "clock": "◷",  # Clock symbol
    "search": "⌕",  # Search symbol
    "data": "≡",  # Three lines
    "doi": "∾",  # Link symbol
    "total": "∑",  # Sum symbol
    "save": "⤓",  # Save/download arrow
    "upload": "⤒",  # Upload arrow
}
# Theme configuration for Rich console output.
# Defines the named styles that this module passes as ``style=...`` to
# ``console.print`` (e.g. "info", "warning" and "error"); each value is a
# Rich style definition string.
THEME = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "red bold",
        "success": "green",
    }
)
def print_summary(results: dict[str, list[Any]], console: Console) -> None:
    """
    Render the outcome of a batch run as a boxed table on the console.

    The table always contains a "Success" row; a "Failed" row is added only
    when at least one DOI failed. Each row lists at most the first three DOIs,
    followed by "..." when more exist.

    Args:
        results (dict[str, list[Any]]): Mapping with the keys "success"
            (list of DOI strings) and "failed" (list of ``(doi, message)``
            pairs).
        console (Console): Rich console the summary is printed to.
    """
    succeeded = results["success"]
    failed = results["failed"]

    def preview(shown: list[str], total: int) -> str:
        # Join the already-sliced DOIs and mark the list as truncated if needed.
        text = ", ".join(shown)
        if total > 3:
            text += "..."
        return text

    summary = Table(title="Processing Results")
    summary.add_column("Status", style="bold")
    summary.add_column("Count", justify="right")
    summary.add_column("DOIs", style="dim")

    summary.add_row(
        f"{ICONS['success']} Success",
        str(len(succeeded)),
        preview(succeeded[:3], len(succeeded)),
    )

    if failed:
        # Failed entries are (doi, error message) pairs; only the DOI is shown.
        failed_dois = [doi for doi, _ in failed[:3]]
        summary.add_row(
            f"{ICONS['error']} Failed",
            str(len(failed)),
            preview(failed_dois, len(failed)),
        )

    console.print(Panel(summary, title="Summary", border_style="blue"))
def process_doi_batch(
    dois: set[str],
    output_dir: Path,
    depositor: str | None = None,
    default_subject: str = "Medicine, Health and Life Sciences",
    contact_mail: str | None = None,
    upload: bool = False,
    ror: bool = False,
    console: Console | None = None,
) -> dict[str, list[Any]]:
    """
    Process a batch of DOIs and return a summary of results.

    Each DOI is handed to a ``MetadataProcessor``; a failure for one DOI is
    recorded and reported but does not abort the remaining DOIs. A summary
    table is printed once the batch is finished.

    Args:
        dois (set[str]): Set of DOIs to process.
        output_dir (Path): Directory where metadata files will be saved.
        depositor (str | None): Depositor name.
        default_subject (str): Default subject for metadata.
        contact_mail (str | None): Contact email address.
        upload (bool): Flag indicating whether to upload metadata to Dataverse.
        ror (bool): Flag indicating whether to use ROR id for affiliation.
        console (Console | None): Rich console instance for output. When
            omitted, a console using this module's ``THEME`` is created.

    Returns:
        dict[str, list[Any]]: Dictionary with keys 'success' (list of DOIs)
        and 'failed' (list of ``(doi, error message)`` tuples).
    """
    results: dict[str, list[Any]] = {"success": [], "failed": []}

    # Use provided console or create a new one. The fallback must carry THEME:
    # the error output below uses style="error", which only THEME defines.
    if console is None:
        console = Console(theme=THEME)

    progress_columns = [
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description:<50}"),
        BarColumn(bar_width=None),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn(""),  # Separator
        TimeElapsedColumn(),
        TextColumn(""),  # Separator
        TextColumn("[bold]{task.completed}/{task.total}"),
    ]

    # Steps per DOI: Fetch, Build, (Upload,) Save
    doi_total_steps = 4 if upload else 3

    with Progress(
        *progress_columns,
        console=console,
        transient=True,  # This makes the progress bar disappear after completion
    ) as progress:
        # Overall task: one step per DOI
        main_task = progress.add_task("[bold blue]Processing DOIs...", total=len(dois))

        # Status task for the DOI currently being processed
        status_task = progress.add_task(
            "[cyan]Current:", total=doi_total_steps, visible=False
        )

        for doi in dois:
            try:
                # Only mark the DOI as shortened when it really was truncated.
                shown_doi = doi if len(doi) <= 50 else f"{doi[:50]}..."
                progress.update(
                    status_task,
                    description=f"[cyan]Current: [white]{shown_doi}",
                    visible=True,
                    completed=0,  # Reset progress for new DOI
                )

                # Derive the output file name from the normalized DOI
                sanitized_filename = sanitize_filename(normalize_doi(doi))
                output_path = output_dir / f"{sanitized_filename}_metadata.json"

                processor = MetadataProcessor(
                    doi=doi,
                    depositor=depositor,
                    output_path=output_path,
                    default_subject=default_subject,
                    contact_mail=contact_mail,
                    upload=upload,
                    ror=ror,
                    console=console,
                    progress=progress,
                    task_id=status_task,
                )

                processor.process()
                results["success"].append(doi)

            except Exception as e:
                # Deliberate best-effort: record the failure and continue with
                # the next DOI instead of aborting the whole batch.
                results["failed"].append((doi, str(e)))

                # Show error but keep progress bar. markup=False so brackets in
                # the DOI or exception text are printed literally.
                progress.console.print(
                    f"{ICONS['error']} Error processing {doi}: {str(e)}",
                    style="error",
                    markup=False,
                )

            finally:
                # Clear current status and count the DOI as handled whether it
                # succeeded or failed, so the overall bar reflects real progress.
                progress.update(status_task, visible=False)
                progress.advance(main_task)

    # Print final summary
    print_summary(results, console)

    return results
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Create and configure the argument parser for the CLI.

    Defined arguments:
        dois: zero or more positional DOIs.
        -f/--file: file with one DOI per line (opened for reading by argparse).
        -o/--output-dir: output directory, default ".".
        -d/--depositor: depositor name, default None.
        -s/--subject: default subject, default "Medicine, Health and Life Sciences".
        -m/--contact-mail: contact email address, default None.
        -u/--upload: boolean flag.
        -r/--use-ror: boolean flag.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(description="Process DOIs to generate metadata")

    parser.add_argument("dois", nargs="*", help="One or more DOIs to process")

    parser.add_argument(
        "-f",
        "--file",
        help="File containing DOIs (one per line)",
        type=argparse.FileType("r"),
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        help="Output directory for metadata files",
        default=".",
    )

    parser.add_argument("-d", "--depositor", help="Name of the depositor", default=None)

    parser.add_argument(
        "-s",
        "--subject",
        help="Default subject",
        default="Medicine, Health and Life Sciences",
    )

    # Default is None (not False): the value is a string when given and is
    # passed on to code typed ``str | None``. Truthiness checks are unaffected.
    parser.add_argument(
        "-m", "--contact-mail", help="Contact email address", default=None
    )

    parser.add_argument(
        "-u", "--upload", help="Upload to Dataverse", action="store_true"
    )

    parser.add_argument(
        "-r", "--use-ror", help="Use ROR ID if available", action="store_true"
    )

    return parser
def main() -> None:
    """
    Main entry point for the console script.

    Parses the command line, collects DOIs from positional arguments and/or
    the ``-f/--file`` input, ensures the output directory exists, validates
    the optional contact email and runs the batch processing.

    Exits with status 1 when no DOIs are supplied, when the output directory
    cannot be created, on Ctrl-C, or on any other unexpected error.
    """
    console = Console(theme=THEME)

    try:
        parser = create_argument_parser()
        args = parser.parse_args()

        # Ensure we have either DOIs as arguments or a file
        if not args.dois and not args.file:
            console.print(
                f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.",
                style="error",
            )
            parser.print_help()
            sys.exit(1)

        # Get DOIs from both direct arguments and file if provided
        dois = set(args.dois)  # Start with directly provided DOIs
        if args.file:
            console.print(
                f"{ICONS['file']} Reading DOIs from file: {args.file.name}",
                style="info",
            )
            try:
                # Blank lines are skipped; surrounding whitespace is stripped.
                dois.update(line.strip() for line in args.file if line.strip())
            finally:
                # argparse.FileType opened the handle at parse time and never
                # closes it; release it here, but leave stdin ("-") alone.
                if args.file is not sys.stdin:
                    args.file.close()

        # Create output directory if it doesn't exist
        output_dir = Path(args.output_dir)
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            console.print(
                f"{ICONS['folder']} Output directory: {output_dir}\n", style="info"
            )
        except OSError as e:
            console.print(
                f"Failed to create output directory: {str(e)}\n", style="error"
            )
            sys.exit(1)

        # Treat a missing/empty --contact-mail as None so the downstream
        # ``str | None`` parameter never receives a boolean.
        contact_mail = args.contact_mail or None
        if contact_mail:
            if not validate_email_address(contact_mail):
                raise ValueError(f"Not a valid email address: {contact_mail}")
            console.print(
                f"{ICONS['info']} Exposing contact email <{contact_mail}> to API services.\n",
                style="info",
            )

        # Process all collected DOIs; the summary is printed by the batch itself
        process_doi_batch(
            dois=dois,
            output_dir=output_dir,
            depositor=args.depositor,
            default_subject=args.subject,
            contact_mail=contact_mail,
            upload=args.upload,
            ror=args.use_ror,
            console=console,
        )

    except KeyboardInterrupt:
        console.print(
            f"\n{ICONS['warning']} Processing interrupted by user", style="warning"
        )
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero.
        # (sys.exit above raises SystemExit, which is not caught here.)
        console.print(
            f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error"
        )
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

11
doi2dataset/main.py Normal file
View file

@ -0,0 +1,11 @@
"""
Main entry point for doi2dataset.
This module provides the primary entry point for the doi2dataset package,
importing and calling the main CLI function.
"""
from .cli import main
# Delegate to the CLI's main() when this module is executed directly.
if __name__ == "__main__":
    main()

View file

@ -6,6 +6,7 @@ and other helper functions used throughout the application.
"""
from .validation import (
normalize_doi,
normalize_string,
sanitize_filename,
split_name,
@ -19,4 +20,5 @@ __all__ = [
"sanitize_filename",
"split_name",
"normalize_string",
"normalize_doi",
]

View file

@ -32,6 +32,21 @@ def validate_doi(doi: str) -> bool:
return is_doi(doi)
def normalize_doi(doi: str) -> str:
    """
    Return the normalized form of a DOI, delegating to idutils.

    Args:
        doi (str): The DOI to normalize.

    Returns:
        str: Whatever ``idutils.normalizers.normalize_doi`` returns for ``doi``.
    """
    # Imported inside the function so idutils is only loaded on first use.
    from idutils import normalizers

    normalized = normalizers.normalize_doi(doi)
    return normalized
def validate_email_address(email: str) -> bool:
"""
Validate an email address and ensure its domain has an MX record.

View file

@ -57,7 +57,7 @@ test = [
]
[project.scripts]
doi2dataset = "doi2dataset:main"
doi2dataset = "doi2dataset.cli:main"
[tool.setuptools_scm]
version_scheme = "python-simplified-semver"