diff --git a/doi2dataset/__init__.py b/doi2dataset/__init__.py index c1681f4..28973a7 100644 --- a/doi2dataset/__init__.py +++ b/doi2dataset/__init__.py @@ -35,6 +35,7 @@ from .api import ( APIClient, LicenseProcessor, ) +from .cli import main, print_summary, process_doi_batch from .core import ( Abstract, BaseMetadataField, @@ -89,6 +90,10 @@ __all__ = [ "NameProcessor", "PIFinder", "SubjectMapper", + # CLI components + "main", + "process_doi_batch", + "print_summary", # Utilities "validate_doi", "validate_email_address", diff --git a/doi2dataset/cli.py b/doi2dataset/cli.py new file mode 100644 index 0000000..d092be3 --- /dev/null +++ b/doi2dataset/cli.py @@ -0,0 +1,326 @@ +""" +Command-line interface for doi2dataset. + +This module provides the main CLI functionality for processing DOIs and generating +metadata for Dataverse datasets. It handles argument parsing, progress tracking, +and batch processing of multiple DOIs. +""" + +import argparse +import sys +from pathlib import Path +from typing import Any + +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, +) +from rich.table import Table +from rich.theme import Theme + +from .processing.metadata import MetadataProcessor +from .utils.validation import normalize_doi, sanitize_filename, validate_email_address + +# Console icons for user-friendly output +ICONS = { + "success": "✓", # Simple checkmark + "error": "✗", # Simple X + "warning": "!", # Simple exclamation + "info": "ℹ", # Info symbol + "processing": "⋯", # Three dots + "done": "∎", # Filled square + "file": "⨳", # Document symbol + "folder": "⊞", # Folder symbol + "clock": "◷", # Clock symbol + "search": "⌕", # Search symbol + "data": "≡", # Three lines + "doi": "∾", # Link symbol + "total": "∑", # Sum symbol + "save": "⤓", # Save/download arrow + "upload": "⤒", # Upload arrow +} + +# Theme configuration for Rich console output +THEME = 
def print_summary(results: dict[str, list[Any]], console: Console) -> None:
    """
    Render the outcome of a batch run as a table inside a "Summary" panel.

    Args:
        results (dict[str, list[Any]]): Mapping with a "success" list of DOIs
            and a "failed" list of ``(doi, error message)`` pairs.
        console (Console): Rich console the panel is printed to.
    """
    succeeded = results["success"]
    failed = results["failed"]

    def preview(shown: list[str], total: int) -> str:
        # At most three DOIs are listed; an ellipsis marks a longer list.
        suffix = "..." if total > 3 else ""
        return ", ".join(shown) + suffix

    table = Table(title="Processing Results")
    for header, options in (
        ("Status", {"style": "bold"}),
        ("Count", {"justify": "right"}),
        ("DOIs", {"style": "dim"}),
    ):
        table.add_column(header, **options)

    table.add_row(
        f"{ICONS['success']} Success",
        str(len(succeeded)),
        preview(succeeded[:3], len(succeeded)),
    )

    # The "Failed" row only appears when at least one DOI failed.
    if failed:
        failed_dois = [doi for doi, _ in failed[:3]]
        table.add_row(
            f"{ICONS['error']} Failed",
            str(len(failed)),
            preview(failed_dois, len(failed)),
        )

    summary_panel = Panel(table, title="Summary", border_style="blue")
    console.print(summary_panel)
def process_doi_batch(
    dois: set[str],
    output_dir: Path,
    depositor: str | None = None,
    default_subject: str = "Medicine, Health and Life Sciences",
    contact_mail: str | None = None,
    upload: bool = False,
    ror: bool = False,
    console: Console | None = None,
) -> dict[str, list[Any]]:
    """
    Process a batch of DOIs and return a summary of results.

    Each DOI is handed to a ``MetadataProcessor``; a failure for one DOI is
    recorded and reported but does not stop the remaining DOIs.

    Args:
        dois (set[str]): Set of DOIs to process.
        output_dir (Path): Directory where metadata files will be saved.
        depositor (str | None): Depositor name.
        default_subject (str): Default subject for metadata.
        contact_mail (str | None): Contact email address.
        upload (bool): Flag indicating whether to upload metadata to Dataverse.
        ror (bool): Flag indicating whether to use ROR id for affiliation.
        console (Console | None): Rich console instance for output. When
            omitted, a console using this module's THEME is created.

    Returns:
        dict[str, list[Any]]: Dictionary with keys 'success' (list of DOIs)
        and 'failed' (list of ``(doi, error message)`` tuples).
    """
    results: dict[str, list[Any]] = {"success": [], "failed": []}

    # The error output below uses the named style "error", which only exists
    # in THEME; a bare Console() would fail to resolve it on the first error.
    if console is None:
        console = Console(theme=THEME)

    progress_columns = [
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description:<50}"),
        BarColumn(bar_width=None),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("•"),  # Separator
        TimeElapsedColumn(),
        TextColumn("•"),  # Separator
        TextColumn("[bold]{task.completed}/{task.total}"),
    ]

    # Steps per DOI: Fetch, Build, Save (+ Upload when requested)
    doi_total_steps = 4 if upload else 3

    with Progress(
        *progress_columns,
        console=console,
        transient=True,  # This makes the progress bar disappear after completion
    ) as progress:
        # Overall task: one step per DOI
        main_task = progress.add_task("[bold blue]Processing DOIs...", total=len(dois))

        # Per-DOI status task, hidden while no DOI is being processed
        status_task = progress.add_task(
            "[cyan]Current:", total=doi_total_steps, visible=False
        )

        for doi in dois:
            try:
                # Only mark the DOI as truncated when it actually was cut
                shown_doi = doi if len(doi) <= 50 else f"{doi[:50]}..."
                progress.update(
                    status_task,
                    description=f"[cyan]Current: [white]{shown_doi}",
                    visible=True,
                    completed=0,  # Reset progress for new DOI
                )

                # The output file name is derived from the normalized DOI
                sanitized_filename = sanitize_filename(normalize_doi(doi))
                output_path = output_dir / f"{sanitized_filename}_metadata.json"

                processor = MetadataProcessor(
                    doi=doi,
                    depositor=depositor,
                    output_path=output_path,
                    default_subject=default_subject,
                    contact_mail=contact_mail,
                    upload=upload,
                    ror=ror,
                    console=console,
                    progress=progress,
                    task_id=status_task,
                )

                processor.process()
                results["success"].append(doi)

            except Exception as e:
                # Per-DOI boundary: record the failure and continue the batch
                results["failed"].append((doi, str(e)))

                # Show error but keep progress bar
                progress.console.print(
                    f"{ICONS['error']} Error processing {doi}: {e}", style="error"
                )
            finally:
                # A failed DOI is still a handled DOI, so the overall bar must
                # advance for it too; otherwise it never reaches the total.
                progress.advance(main_task)
                # Clear current status
                progress.update(status_task, visible=False)

    # Print final summary
    print_summary(results, console)

    return results
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Create and configure the argument parser for the CLI.

    Options that carry an optional string value (depositor, contact mail)
    default to ``None`` when not given, so downstream code receives either a
    string or ``None`` and never a boolean.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(description="Process DOIs to generate metadata")

    parser.add_argument("dois", nargs="*", help="One or more DOIs to process")

    parser.add_argument(
        "-f",
        "--file",
        help="File containing DOIs (one per line)",
        type=argparse.FileType("r"),
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        help="Output directory for metadata files",
        default=".",
    )

    parser.add_argument("-d", "--depositor", help="Name of the depositor", default=None)

    parser.add_argument(
        "-s",
        "--subject",
        help="Default subject",
        default="Medicine, Health and Life Sciences",
    )

    # None (not False) marks "no address given": the value is forwarded to a
    # parameter typed ``str | None``.
    parser.add_argument(
        "-m", "--contact-mail", help="Contact email address", default=None
    )

    parser.add_argument(
        "-u", "--upload", help="Upload to Dataverse", action="store_true"
    )

    parser.add_argument(
        "-r", "--use-ror", help="Use ROR ID if available", action="store_true"
    )

    return parser
def main() -> None:
    """
    Main entry point for the console script.

    Parses the command line, collects DOIs from positional arguments and/or
    the ``-f/--file`` option, ensures the output directory exists, validates
    the optional contact email and runs the batch.

    Exits with status 1 when no DOIs are given, when the output directory
    cannot be created, on keyboard interrupt, or on any unexpected error
    (including an invalid contact email address).
    """
    console = Console(theme=THEME)

    try:
        parser = create_argument_parser()
        args = parser.parse_args()

        # Ensure we have either DOIs as arguments or a file
        if not args.dois and not args.file:
            console.print(
                f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.",
                style="error",
            )
            parser.print_help()
            sys.exit(1)

        # Get DOIs from both direct arguments and file if provided
        dois = set(args.dois)  # Start with directly provided DOIs
        if args.file:
            console.print(
                f"{ICONS['file']} Reading DOIs from file: {args.file.name}",
                style="info",
            )
            try:
                dois.update(line.strip() for line in args.file if line.strip())
            finally:
                # argparse opened the file for us; release the handle once
                # read. "-" maps to sys.stdin, which is left open.
                if args.file is not sys.stdin:
                    args.file.close()

        # Create output directory if it doesn't exist
        output_dir = Path(args.output_dir)
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            console.print(
                f"{ICONS['folder']} Output directory: {output_dir}\n", style="info"
            )
        except Exception as e:
            console.print(
                f"Failed to create output directory: {str(e)}\n", style="error"
            )
            sys.exit(1)

        # Treat any falsy parser value as "no address given" so the batch
        # always receives either a string or None.
        contact_mail = args.contact_mail or None
        if contact_mail:
            if not validate_email_address(contact_mail):
                raise ValueError(f"Not a valid email address: {contact_mail}")
            console.print(
                f"{ICONS['info']} Exposing contact email <{contact_mail}> to API services.\n",
                style="info",
            )

        # Process all collected DOIs; the batch prints its own summary
        process_doi_batch(
            dois=dois,
            output_dir=output_dir,
            depositor=args.depositor,
            default_subject=args.subject,
            contact_mail=contact_mail,
            upload=args.upload,
            ror=args.use_ror,
            console=console,
        )

    except KeyboardInterrupt:
        console.print(
            f"\n{ICONS['warning']} Processing interrupted by user", style="warning"
        )
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report and exit non-zero. SystemExit raised by
        # the sys.exit() calls above is not an Exception and passes through.
        console.print(
            f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error"
        )
        sys.exit(1)
def normalize_doi(doi: str) -> str:
    """
    Return the normalized form of a DOI, delegating to idutils.

    Args:
        doi (str): The DOI to normalize.

    Returns:
        str: The value returned by ``idutils.normalizers.normalize_doi``.
    """
    # Imported inside the function, as before, so idutils is only loaded
    # when a DOI is actually normalized.
    from idutils import normalizers

    normalized = normalizers.normalize_doi(doi)
    return normalized