feat: complete CLI module separation

- Extract CLI functionality from monolithic script to dedicated modules
- Create doi2dataset/cli.py with all command-line interface logic
- Create doi2dataset/main.py as clean entry point
- Add normalize_doi function to utils.validation module
- Update pyproject.toml entry point to use new CLI module
- Maintain full backward compatibility with original doi2dataset.py

CLI modules created:
- cli.py: CLI functionality, argument parsing, progress tracking
- main.py: Clean entry point module

Features maintained:
- All command-line arguments and options work identically
- Progress tracking and rich console output preserved
- Error handling and validation maintained
- DOI batch processing functionality intact
- All 38 tests passing with 61% coverage

Benefits achieved:
- Clean separation between CLI and core business logic
- Multiple entry points available (cli.py, main.py, original script)
- Foundation for future CLI enhancements and testing
- Professional package structure with proper entry points
2025-07-22 11:08:35 +02:00 · 2025-07-22 11:08:35 +02:00 · 091311038d
commit 091311038d
parent b6209691c3
6 changed files with 360 additions and 1 deletions
--- a/doi2dataset/__init__.py
+++ b/doi2dataset/__init__.py
@ -35,6 +35,7 @@ from .api import (
    APIClient,
    LicenseProcessor,
 )
+from .cli import main, print_summary, process_doi_batch
 from .core import (
    Abstract,
    BaseMetadataField,
@ -89,6 +90,10 @@ __all__ = [
    "NameProcessor",
    "PIFinder",
    "SubjectMapper",
+    # CLI components
+    "main",
+    "process_doi_batch",
+    "print_summary",
    # Utilities
    "validate_doi",
    "validate_email_address",
--- a/doi2dataset/cli.py
+++ b/doi2dataset/cli.py
@ -0,0 +1,326 @@
+"""
+Command-line interface for doi2dataset.
+
+This module provides the main CLI functionality for processing DOIs and generating
+metadata for Dataverse datasets. It handles argument parsing, progress tracking,
+and batch processing of multiple DOIs.
+"""
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Any
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.table import Table
+from rich.theme import Theme
+
+from .processing.metadata import MetadataProcessor
+from .utils.validation import normalize_doi, sanitize_filename, validate_email_address
+
+# Console icons for user-friendly output.
+# Plain Unicode glyphs keyed by meaning; messages in this module look them up
+# by key (e.g. ICONS['success'], ICONS['error']) and prefix them to the text.
+ICONS = {
+    "success": "✓",  # Simple checkmark
+    "error": "✗",  # Simple X
+    "warning": "!",  # Simple exclamation
+    "info": "ℹ",  # Info symbol
+    "processing": "⋯",  # Three dots
+    "done": "∎",  # Filled square
+    "file": "⨳",  # Document symbol
+    "folder": "⊞",  # Folder symbol
+    "clock": "◷",  # Clock symbol
+    "search": "⌕",  # Search symbol
+    "data": "≡",  # Three lines
+    "doi": "∾",  # Link symbol
+    "total": "∑",  # Sum symbol
+    "save": "⤓",  # Save/download arrow
+    "upload": "⤒",  # Upload arrow
+}
+
+# Theme configuration for Rich console output.
+# Maps the style names that console.print() calls in this module pass as
+# style="info" / style="warning" / style="error" to Rich style definitions.
+# main() installs it with Console(theme=THEME).
+THEME = Theme(
+    {
+        "info": "cyan",
+        "warning": "yellow",
+        "error": "red bold",
+        "success": "green",
+    }
+)
+
+
+def print_summary(results: dict[str, list[Any]], console: Console) -> None:
+    """
+    Render the outcome of a batch run as a table inside a summary panel.
+
+    A "Success" row is always emitted; a "Failed" row is added only when at
+    least one DOI failed. Each row lists at most the first three DOIs and
+    appends "..." when more exist.
+
+    Args:
+        results (dict[str, list[Any]]): Mapping with a "success" list of DOI
+            strings and a "failed" list of (doi, error message) pairs.
+        console (Console): Rich console the panel is printed to.
+    """
+    preview_limit = 3
+    succeeded = results["success"]
+    failed = results["failed"]
+
+    def preview(shown: list[str], total: int) -> str:
+        # Join the DOIs to display and mark the list as truncated when needed.
+        suffix = "..." if total > preview_limit else ""
+        return ", ".join(shown) + suffix
+
+    summary = Table(title="Processing Results")
+    for header, options in (
+        ("Status", {"style": "bold"}),
+        ("Count", {"justify": "right"}),
+        ("DOIs", {"style": "dim"}),
+    ):
+        summary.add_column(header, **options)
+
+    summary.add_row(
+        f"{ICONS['success']} Success",
+        str(len(succeeded)),
+        preview(succeeded[:preview_limit], len(succeeded)),
+    )
+
+    if failed:
+        # Failed entries are (doi, message) pairs; only the DOI is listed.
+        failed_dois = [doi for doi, _ in failed[:preview_limit]]
+        summary.add_row(
+            f"{ICONS['error']} Failed",
+            str(len(failed)),
+            preview(failed_dois, len(failed)),
+        )
+
+    console.print(Panel(summary, title="Summary", border_style="blue"))
+
+
+def process_doi_batch(
+    dois: set[str],
+    output_dir: Path,
+    depositor: str | None = None,
+    default_subject: str = "Medicine, Health and Life Sciences",
+    contact_mail: str | None = None,
+    upload: bool = False,
+    ror: bool = False,
+    console: Console | None = None,
+) -> dict[str, list[Any]]:
+    """
+    Process a batch of DOIs and return a summary of results.
+
+    Each DOI is handed to a MetadataProcessor that writes to
+    ``<output_dir>/<sanitized normalized DOI>_metadata.json``. An exception
+    raised while handling one DOI is recorded and reported, and processing
+    continues with the next DOI. A summary table is printed at the end.
+
+    Args:
+        dois (set[str]): Set of DOIs to process.
+        output_dir (Path): Directory where metadata files will be saved.
+        depositor (str | None): Depositor name.
+        default_subject (str): Default subject for metadata.
+        contact_mail (str | None): Contact email address.
+        upload (bool): Flag indicating whether to upload metadata to Dataverse.
+        ror (bool): Flag indicating whether to use ROR id for affiliation.
+        console (Console | None): Rich console instance for output. When
+            omitted, a console using this module's THEME is created.
+
+    Returns:
+        dict[str, list[Any]]: Dictionary with keys 'success' (list of DOIs)
+        and 'failed' (list of (doi, error message) tuples).
+    """
+    results: dict[str, list[Any]] = {"success": [], "failed": []}
+
+    # Use provided console or create a new one. The fallback must carry THEME:
+    # the error output below uses style="error", which only THEME defines.
+    if console is None:
+        console = Console(theme=THEME)
+
+    progress_columns = [
+        SpinnerColumn(),
+        TextColumn("[bold blue]{task.description:<50}"),
+        BarColumn(bar_width=None),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        TextColumn("•"),  # Separator
+        TimeElapsedColumn(),
+        TextColumn("•"),  # Separator
+        TextColumn("[bold]{task.completed}/{task.total}"),
+    ]
+
+    # Steps per DOI: Fetch, Build, Save, plus Upload when requested.
+    doi_total_steps = 4 if upload else 3
+
+    # Longest DOI prefix shown in the status line before it is abbreviated.
+    max_doi_display = 50
+
+    with Progress(
+        *progress_columns,
+        console=console,
+        transient=True,  # This makes the progress bar disappear after completion
+    ) as progress:
+        # Overall task: one unit per DOI, whether it succeeds or fails.
+        main_task = progress.add_task("[bold blue]Processing DOIs...", total=len(dois))
+
+        # Status task for the DOI currently being processed.
+        status_task = progress.add_task(
+            "[cyan]Current:", total=doi_total_steps, visible=False
+        )
+
+        for doi in dois:
+            try:
+                # Only mark the DOI as abbreviated when it really was cut.
+                if len(doi) > max_doi_display:
+                    shown_doi = f"{doi[:max_doi_display]}..."
+                else:
+                    shown_doi = doi
+
+                progress.update(
+                    status_task,
+                    description=f"[cyan]Current: [white]{shown_doi}",
+                    visible=True,
+                    completed=0,  # Reset progress for new DOI
+                )
+
+                sanitized_filename = sanitize_filename(normalize_doi(doi))
+                output_path = output_dir / f"{sanitized_filename}_metadata.json"
+
+                processor = MetadataProcessor(
+                    doi=doi,
+                    depositor=depositor,
+                    output_path=output_path,
+                    default_subject=default_subject,
+                    contact_mail=contact_mail,
+                    upload=upload,
+                    ror=ror,
+                    console=console,
+                    progress=progress,
+                    task_id=status_task,
+                )
+
+                processor.process()
+                results["success"].append(doi)
+
+            except Exception as e:
+                # Per-DOI boundary: record the failure and keep going.
+                results["failed"].append((doi, str(e)))
+
+                # Show error but keep progress bar
+                progress.console.print(
+                    f"{ICONS['error']} Error processing {doi}: {str(e)}", style="error"
+                )
+            finally:
+                # Clear current status and count this DOI as handled so the
+                # overall bar also advances for failed DOIs.
+                progress.update(status_task, visible=False)
+                progress.advance(main_task)
+
+    # Print final summary
+    print_summary(results, console)
+
+    return results
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    """
+    Create and configure the argument parser for the CLI.
+
+    Defines the positional ``dois`` list and the options ``-f/--file``,
+    ``-o/--output-dir``, ``-d/--depositor``, ``-s/--subject``,
+    ``-m/--contact-mail``, ``-u/--upload`` and ``-r/--use-ror``.
+
+    Returns:
+        argparse.ArgumentParser: Configured argument parser.
+    """
+    parser = argparse.ArgumentParser(description="Process DOIs to generate metadata")
+
+    # Zero or more DOIs; an empty list is allowed because -f may supply them.
+    parser.add_argument("dois", nargs="*", help="One or more DOIs to process")
+
+    parser.add_argument(
+        "-f",
+        "--file",
+        help="File containing DOIs (one per line)",
+        type=argparse.FileType("r"),
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output-dir",
+        help="Output directory for metadata files",
+        default=".",
+    )
+
+    parser.add_argument("-d", "--depositor", help="Name of the depositor", default=None)
+
+    parser.add_argument(
+        "-s",
+        "--subject",
+        help="Default subject",
+        default="Medicine, Health and Life Sciences",
+    )
+
+    # None (not False) marks "no address given", matching the str | None
+    # contract of process_doi_batch's contact_mail parameter.
+    parser.add_argument(
+        "-m", "--contact-mail", help="Contact email address", default=None
+    )
+
+    parser.add_argument(
+        "-u", "--upload", help="Upload to Dataverse", action="store_true"
+    )
+
+    parser.add_argument(
+        "-r", "--use-ror", help="Use ROR ID if available", action="store_true"
+    )
+
+    return parser
+
+
+def main() -> None:
+    """
+    Main entry point for the console script.
+
+    Parses the command line, collects DOIs from the positional arguments and
+    the optional ``-f/--file`` input, ensures the output directory exists,
+    validates the contact email when one is given, and runs the batch.
+
+    Exits with status 1 when no DOIs are provided, when the output directory
+    cannot be created, on keyboard interrupt, and on any other unexpected
+    error (including an invalid contact email).
+    """
+    console = Console(theme=THEME)
+
+    try:
+        parser = create_argument_parser()
+        args = parser.parse_args()
+
+        # Ensure we have either DOIs as arguments or a file
+        if not args.dois and not args.file:
+            console.print(
+                f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.",
+                style="error",
+            )
+            parser.print_help()
+            sys.exit(1)
+
+        # Get DOIs from both direct arguments and file if provided
+        dois = set(args.dois)  # Start with directly provided DOIs
+        if args.file:
+            console.print(
+                f"{ICONS['file']} Reading DOIs from file: {args.file.name}",
+                style="info",
+            )
+            try:
+                dois.update(line.strip() for line in args.file if line.strip())
+            finally:
+                # argparse.FileType opened this handle; release it once read.
+                # "-" maps to sys.stdin, which must stay open.
+                if args.file is not sys.stdin:
+                    args.file.close()
+
+        # Create output directory if it doesn't exist
+        output_dir = Path(args.output_dir)
+        try:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            console.print(
+                f"{ICONS['folder']} Output directory: {output_dir}\n", style="info"
+            )
+        except Exception as e:
+            console.print(
+                f"Failed to create output directory: {str(e)}\n", style="error"
+            )
+            sys.exit(1)
+
+        # Collapse any falsy "not provided" value from the parser to None so
+        # process_doi_batch receives the str | None it declares.
+        contact_mail = args.contact_mail or None
+        if contact_mail:
+            if not validate_email_address(contact_mail):
+                raise ValueError(f"Not a valid email address: {contact_mail}")
+            console.print(
+                f"{ICONS['info']} Exposing contact email <{contact_mail}> to API services.\n",
+                style="info",
+            )
+
+        # Process all collected DOIs; the batch prints its own summary.
+        process_doi_batch(
+            dois=dois,
+            output_dir=output_dir,
+            depositor=args.depositor,
+            default_subject=args.subject,
+            contact_mail=contact_mail,
+            upload=args.upload,
+            ror=args.use_ror,
+            console=console,
+        )
+
+    except KeyboardInterrupt:
+        console.print(
+            f"\n{ICONS['warning']}  Processing interrupted by user", style="warning"
+        )
+        sys.exit(1)
+    except Exception as e:
+        # Top-level boundary: report the problem and signal failure.
+        console.print(
+            f"\n{ICONS['error']}  An unexpected error occurred: {str(e)}", style="error"
+        )
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/doi2dataset/main.py
+++ b/doi2dataset/main.py
@ -0,0 +1,11 @@
+"""
+Main entry point for doi2dataset.
+
+This module provides the primary entry point for the doi2dataset package,
+importing and calling the main CLI function.
+
+Importing this module only makes ``main`` available; ``main()`` is called
+solely when the module is executed as a script.
+"""
+
+# Relative import: this module has to be run within its package context
+# (e.g. ``python -m doi2dataset.main``), not as a standalone file.
+from .cli import main
+
+# Run the CLI only on direct execution, never on import.
+if __name__ == "__main__":
+    main()
--- a/doi2dataset/utils/__init__.py
+++ b/doi2dataset/utils/__init__.py
@ -6,6 +6,7 @@ and other helper functions used throughout the application.
 """

 from .validation import (
+    normalize_doi,
    normalize_string,
    sanitize_filename,
    split_name,
@ -19,4 +20,5 @@ __all__ = [
    "sanitize_filename",
    "split_name",
    "normalize_string",
+    "normalize_doi",
 ]
--- a/doi2dataset/utils/validation.py
+++ b/doi2dataset/utils/validation.py
@ -32,6 +32,21 @@ def validate_doi(doi: str) -> bool:
    return is_doi(doi)


+def normalize_doi(doi: str) -> str:
+    """
+    Return the normalized form of a DOI as produced by idutils.
+
+    Args:
+        doi (str): DOI string to normalize.
+
+    Returns:
+        str: Result of ``idutils.normalizers.normalize_doi`` for ``doi``.
+    """
+    # Function-scope import under an alias, so the idutils function does not
+    # clash with this wrapper's identical name.
+    from idutils.normalizers import normalize_doi as _idutils_normalize
+
+    normalized = _idutils_normalize(doi)
+    return normalized
+
+
 def validate_email_address(email: str) -> bool:
    """
    Validate an email address and ensure its domain has an MX record.
--- a/pyproject.toml
+++ b/pyproject.toml
@ -57,7 +57,7 @@ test = [
 ]

 [project.scripts]
-doi2dataset = "doi2dataset:main"
+doi2dataset = "doi2dataset.cli:main"

 [tool.setuptools_scm]
 version_scheme = "python-simplified-semver"