feat: complete CLI module separation
- Extract CLI functionality from monolithic script to dedicated modules
- Create doi2dataset/cli.py with all command-line interface logic
- Create doi2dataset/main.py as clean entry point
- Add normalize_doi function to utils.validation module
- Update pyproject.toml entry point to use new CLI module
- Maintain full backward compatibility with original doi2dataset.py

CLI modules created:
- cli.py: CLI functionality, argument parsing, progress tracking
- main.py: Clean entry point module

Features maintained:
- All command-line arguments and options work identically
- Progress tracking and rich console output preserved
- Error handling and validation maintained
- DOI batch processing functionality intact
- All 38 tests passing with 61% coverage

Benefits achieved:
- Clean separation between CLI and core business logic
- Multiple entry points available (cli.py, main.py, original script)
- Foundation for future CLI enhancements and testing
- Professional package structure with proper entry points
This commit is contained in:
parent
b6209691c3
commit
091311038d
6 changed files with 360 additions and 1 deletions
|
@ -35,6 +35,7 @@ from .api import (
|
|||
APIClient,
|
||||
LicenseProcessor,
|
||||
)
|
||||
from .cli import main, print_summary, process_doi_batch
|
||||
from .core import (
|
||||
Abstract,
|
||||
BaseMetadataField,
|
||||
|
@ -89,6 +90,10 @@ __all__ = [
|
|||
"NameProcessor",
|
||||
"PIFinder",
|
||||
"SubjectMapper",
|
||||
# CLI components
|
||||
"main",
|
||||
"process_doi_batch",
|
||||
"print_summary",
|
||||
# Utilities
|
||||
"validate_doi",
|
||||
"validate_email_address",
|
||||
|
|
326
doi2dataset/cli.py
Normal file
326
doi2dataset/cli.py
Normal file
|
@ -0,0 +1,326 @@
|
|||
"""
|
||||
Command-line interface for doi2dataset.
|
||||
|
||||
This module provides the main CLI functionality for processing DOIs and generating
|
||||
metadata for Dataverse datasets. It handles argument parsing, progress tracking,
|
||||
and batch processing of multiple DOIs.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.progress import (
|
||||
BarColumn,
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
TimeElapsedColumn,
|
||||
)
|
||||
from rich.table import Table
|
||||
from rich.theme import Theme
|
||||
|
||||
from .processing.metadata import MetadataProcessor
|
||||
from .utils.validation import normalize_doi, sanitize_filename, validate_email_address
|
||||
|
||||
# Console icons for user-friendly output.
# Each value is a short Unicode glyph that this module interpolates into the
# messages it prints (e.g. ICONS['error'] in front of error lines).
# NOTE(review): only some keys (success, error, warning, info, file, folder)
# are referenced in the code visible here; the rest may be used elsewhere.
ICONS = {
    "success": "✓",  # Simple checkmark
    "error": "✗",  # Simple X
    "warning": "!",  # Simple exclamation
    "info": "ℹ",  # Info symbol
    "processing": "⋯",  # Three dots
    "done": "∎",  # Filled square
    "file": "⨳",  # Document symbol
    "folder": "⊞",  # Folder symbol
    "clock": "◷",  # Clock symbol
    "search": "⌕",  # Search symbol
    "data": "≡",  # Three lines
    "doi": "∾",  # Link symbol
    "total": "∑",  # Sum symbol
    "save": "⤓",  # Save/download arrow
    "upload": "⤒",  # Upload arrow
}

# Theme configuration for Rich console output.
# Defines the custom style names ("info", "warning", "error", "success") that
# this module passes as ``style=...`` when printing; a Console must be created
# with this theme for those names to resolve.
THEME = Theme(
    {
        "info": "cyan",
        "warning": "yellow",
        "error": "red bold",
        "success": "green",
    }
)
|
||||
|
||||
|
||||
def print_summary(results: dict[str, list[Any]], console: Console) -> None:
    """
    Render the outcome of a batch run as a Rich table wrapped in a panel.

    The table always has a "Success" row; a "Failed" row is added only when
    at least one DOI failed. Each row shows the number of DOIs and a preview
    of at most the first three, followed by "..." when there are more.

    Args:
        results (dict[str, list[Any]]): Mapping with a "success" list of DOI
            strings and a "failed" list of ``(doi, error)`` pairs.
        console (Console): Rich console the panel is printed to.
    """
    succeeded = results["success"]

    summary = Table(title="Processing Results")
    for heading, options in (
        ("Status", {"style": "bold"}),
        ("Count", {"justify": "right"}),
        ("DOIs", {"style": "dim"}),
    ):
        summary.add_column(heading, **options)

    # Preview of the successful DOIs, truncated to three entries.
    success_preview = ", ".join(succeeded[:3])
    if len(succeeded) > 3:
        success_preview += "..."
    summary.add_row(
        f"{ICONS['success']} Success",
        str(len(succeeded)),
        success_preview,
    )

    failures = results["failed"]
    if failures:
        # Failed entries are (doi, error) pairs; only the DOI is listed.
        failed_preview = ", ".join([doi for doi, _ in failures[:3]])
        if len(failures) > 3:
            failed_preview += "..."
        summary.add_row(
            f"{ICONS['error']} Failed",
            str(len(failures)),
            failed_preview,
        )

    console.print(Panel(summary, title="Summary", border_style="blue"))
|
||||
|
||||
|
||||
def process_doi_batch(
    dois: set[str],
    output_dir: Path,
    depositor: str | None = None,
    default_subject: str = "Medicine, Health and Life Sciences",
    contact_mail: str | None = None,
    upload: bool = False,
    ror: bool = False,
    console: Console | None = None,
) -> dict[str, list[Any]]:
    """
    Process a batch of DOIs and return a summary of results.

    Each DOI is handed to a ``MetadataProcessor``; an exception raised while
    handling one DOI is recorded and printed, and the batch continues with
    the next DOI. A summary table is printed once all DOIs were attempted.

    Args:
        dois (set[str]): Set of DOIs to process.
        output_dir (Path): Directory where metadata files will be saved.
        depositor (str | None): Depositor name.
        default_subject (str): Default subject for metadata.
        contact_mail (str | None): Contact email address.
        upload (bool): Flag indicating whether to upload metadata to Dataverse.
        ror (bool): Flag indicating whether to use ROR id for affiliation.
        console (Console | None): Rich console instance for output. When
            omitted, a console using this module's THEME is created.

    Returns:
        dict[str, list[Any]]: Dictionary with keys 'success' (list of DOIs)
        and 'failed' (list of ``(doi, error message)`` tuples).
    """
    results: dict[str, list[Any]] = {"success": [], "failed": []}

    # Use provided console or create a new one. The fallback must carry THEME:
    # the error path below prints with style="error", which only exists there.
    if console is None:
        console = Console(theme=THEME)

    progress_columns = [
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description:<50}"),
        BarColumn(bar_width=None),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("•"),  # Separator
        TimeElapsedColumn(),
        TextColumn("•"),  # Separator
        TextColumn("[bold]{task.completed}/{task.total}"),
    ]

    # Define steps for each DOI processing
    if upload:
        doi_total_steps = 4  # Fetch, Build, Upload, Save
    else:
        doi_total_steps = 3  # Fetch, Build, Save

    with Progress(
        *progress_columns,
        console=console,
        transient=True,  # This makes the progress bar disappear after completion
    ) as progress:
        # Overall task: one step per DOI
        main_task = progress.add_task("[bold blue]Processing DOIs...", total=len(dois))

        # Per-DOI status task, hidden while no DOI is being processed
        status_task = progress.add_task(
            "[cyan]Current:", total=doi_total_steps, visible=False
        )

        for doi in dois:
            try:
                # Update status display
                progress.update(
                    status_task,
                    description=f"[cyan]Current: [white]{doi[:50]}...",
                    visible=True,
                    completed=0,  # Reset progress for new DOI
                )

                # Derive the output file name from the normalized DOI
                sanitized_filename = sanitize_filename(normalize_doi(doi))
                output_path = output_dir / f"{sanitized_filename}_metadata.json"

                processor = MetadataProcessor(
                    doi=doi,
                    depositor=depositor,
                    output_path=output_path,
                    default_subject=default_subject,
                    contact_mail=contact_mail,
                    upload=upload,
                    ror=ror,
                    console=console,
                    progress=progress,
                    task_id=status_task,
                )

                processor.process()
                results["success"].append(doi)

            except Exception as e:
                # Record the failure and keep going with the remaining DOIs
                results["failed"].append((doi, str(e)))

                # Show error but keep progress bar
                progress.console.print(
                    f"{ICONS['error']} Error processing {doi}: {str(e)}", style="error"
                )
            finally:
                # Count the DOI as handled whether it succeeded or failed, so
                # the overall bar reflects every attempted DOI.
                progress.advance(main_task)
                # Clear current status
                progress.update(status_task, visible=False)

    # Print final summary
    print_summary(results, console)

    return results
|
||||
|
||||
|
||||
def create_argument_parser() -> argparse.ArgumentParser:
    """
    Create and configure the argument parser for the CLI.

    Defined arguments:
        dois: zero or more positional DOIs.
        -f/--file: file with one DOI per line (opened for reading by argparse).
        -o/--output-dir: output directory, defaults to ".".
        -d/--depositor: depositor name, defaults to None.
        -s/--subject: default subject.
        -m/--contact-mail: contact email address, defaults to None.
        -u/--upload: boolean flag, upload to Dataverse.
        -r/--use-ror: boolean flag, use ROR ID if available.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    parser = argparse.ArgumentParser(description="Process DOIs to generate metadata")

    parser.add_argument("dois", nargs="*", help="One or more DOIs to process")

    parser.add_argument(
        "-f",
        "--file",
        help="File containing DOIs (one per line)",
        type=argparse.FileType("r"),
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        help="Output directory for metadata files",
        default=".",
    )

    parser.add_argument("-d", "--depositor", help="Name of the depositor", default=None)

    parser.add_argument(
        "-s",
        "--subject",
        help="Default subject",
        default="Medicine, Health and Life Sciences",
    )

    # None (not False) marks "no contact mail", matching the ``str | None``
    # type the value is passed on as; both are falsy for truthiness checks.
    parser.add_argument(
        "-m", "--contact-mail", help="Contact email address", default=None
    )

    parser.add_argument(
        "-u", "--upload", help="Upload to Dataverse", action="store_true"
    )

    parser.add_argument(
        "-r", "--use-ror", help="Use ROR ID if available", action="store_true"
    )

    return parser
|
||||
|
||||
|
||||
def main() -> None:
    """
    Main entry point for the console script.

    Parses the command line, collects DOIs from positional arguments and/or
    the ``-f/--file`` option, makes sure the output directory exists,
    validates the optional contact email and runs the batch processing.

    Exits with status 1 when no DOIs are given, the output directory cannot
    be created, the run is interrupted by the user, or any other exception
    reaches this function (including an invalid contact email).
    """
    console = Console(theme=THEME)

    try:
        parser = create_argument_parser()
        args = parser.parse_args()

        # Ensure we have either DOIs as arguments or a file
        if not args.dois and not args.file:
            console.print(
                f"{ICONS['error']} Error: No DOIs provided. Use either command line arguments or -f/--file option.",
                style="error",
            )
            parser.print_help()
            sys.exit(1)

        # Get DOIs from both direct arguments and file if provided
        dois = set(args.dois)  # Start with directly provided DOIs
        if args.file:
            console.print(
                f"{ICONS['file']} Reading DOIs from file: {args.file.name}",
                style="info",
            )
            # argparse opened the file while parsing; close it once it is read
            with args.file as doi_file:
                dois.update(line.strip() for line in doi_file if line.strip())

        # Create output directory if it doesn't exist
        output_dir = Path(args.output_dir)
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            console.print(
                f"{ICONS['folder']} Output directory: {output_dir}\n", style="info"
            )
        except Exception as e:
            console.print(
                f"Failed to create output directory: {str(e)}\n", style="error"
            )
            sys.exit(1)

        # Treat any falsy value (e.g. an unset option) as "no contact mail" so
        # that process_doi_batch only ever receives a string or None.
        contact_mail = args.contact_mail or None
        if contact_mail:
            if not validate_email_address(contact_mail):
                raise ValueError(f"Not a valid email address: {contact_mail}")
            console.print(
                f"{ICONS['info']} Exposing contact email <{contact_mail}> to API services.\n",
                style="info",
            )

        # Process DOIs
        process_doi_batch(
            dois=dois,
            output_dir=output_dir,
            depositor=args.depositor,
            default_subject=args.subject,
            contact_mail=contact_mail,
            upload=args.upload,
            ror=args.use_ror,
            console=console,
        )

    except KeyboardInterrupt:
        console.print(
            f"\n{ICONS['warning']} Processing interrupted by user", style="warning"
        )
        sys.exit(1)
    except Exception as e:
        # SystemExit raised by the sys.exit calls above is not an Exception
        # subclass and therefore passes through this handler untouched.
        console.print(
            f"\n{ICONS['error']} An unexpected error occurred: {str(e)}", style="error"
        )
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
11
doi2dataset/main.py
Normal file
11
doi2dataset/main.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
"""
|
||||
Main entry point for doi2dataset.
|
||||
|
||||
This module provides the primary entry point for the doi2dataset package,
|
||||
importing and calling the main CLI function.
|
||||
"""
|
||||
|
||||
from .cli import main
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -6,6 +6,7 @@ and other helper functions used throughout the application.
|
|||
"""
|
||||
|
||||
from .validation import (
|
||||
normalize_doi,
|
||||
normalize_string,
|
||||
sanitize_filename,
|
||||
split_name,
|
||||
|
@ -19,4 +20,5 @@ __all__ = [
|
|||
"sanitize_filename",
|
||||
"split_name",
|
||||
"normalize_string",
|
||||
"normalize_doi",
|
||||
]
|
||||
|
|
|
@ -32,6 +32,21 @@ def validate_doi(doi: str) -> bool:
|
|||
return is_doi(doi)
|
||||
|
||||
|
||||
def normalize_doi(doi: str) -> str:
    """
    Return the normalized form of a DOI as computed by idutils.

    Args:
        doi (str): The DOI to normalize.

    Returns:
        str: The value returned by ``idutils.normalizers.normalize_doi``.
    """
    # Function-scope import of the idutils normalizers module; the call is
    # qualified so it cannot be confused with this wrapper of the same name.
    from idutils import normalizers as _idutils_normalizers

    return _idutils_normalizers.normalize_doi(doi)
|
||||
|
||||
|
||||
def validate_email_address(email: str) -> bool:
|
||||
"""
|
||||
Validate an email address and ensure its domain has an MX record.
|
||||
|
|
|
@ -57,7 +57,7 @@ test = [
|
|||
]
|
||||
|
||||
[project.scripts]
|
||||
doi2dataset = "doi2dataset:main"
|
||||
doi2dataset = "doi2dataset.cli:main"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
version_scheme = "python-simplified-semver"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue