From f84a27484816405d78ff55739cc98f64b4451b0f Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Tue, 20 May 2025 13:07:43 +0200 Subject: [PATCH 1/3] Add support for ROR institution identifiers in affiliations --- doi2dataset.py | 132 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/doi2dataset.py b/doi2dataset.py index 683f4dc..7a49732 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -158,6 +158,7 @@ class BaseMetadataField[T]: multiple: bool value: T type: FieldType = field(init=False) + expanded_value: dict[str, str] | None = field(default=None) def __post_init__(self) -> None: """ @@ -196,19 +197,29 @@ class PrimitiveMetadataField(BaseMetadataField[str]): def _set_type(self) -> None: self.type = FieldType.PRIMITIVE - def to_dict(self) -> dict[str, str | bool]: + def to_dict(self) -> dict[str, str | bool | dict[str, str]]: """ Convert the primitive metadata field to a dictionary representation. Returns: dict[str, str | bool]: Dictionary with field properties. 
""" - return { - "typeName": self.name, - "typeClass": self.type.value, - "multiple": self.multiple, - "value": self.value, - } + + if self.expanded_value: + return { + "typeName": self.name, + "typeClass": self.type.value, + "multiple": self.multiple, + "value": self.value, + "expandedValue": self.expanded_value + } + else: + return { + "typeName": self.name, + "typeClass": self.type.value, + "multiple": self.multiple, + "value": self.value, + } @dataclass class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]): @@ -264,6 +275,22 @@ class CompoundMetadataField( "value": value_list } +@dataclass +class Institution: + display_name: str + ror: str = "" + + def affiliation_field(self) -> PrimitiveMetadataField: + if self.ror: + expanded_value = { + "scheme": "http://www.grid.ac/ontology/", + "termName": self.display_name, + "@type": "https://schema.org/Organization" + } + return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value) + else: + return PrimitiveMetadataField("authorAffiliation", False, self.display_name) + @dataclass class Person: """ @@ -274,16 +301,36 @@ class Person: given_name (str): Given name of the person. orcid (str): ORCID identifier (optional). email (str): Email address (optional). - affiliation (str): Affiliation of the person (optional). + affiliation (Institution): Affiliation of the person (optional). project (list[str]): List of associated projects. 
""" family_name: str given_name: str orcid: str = "" email: str = "" - affiliation: str = "" + affiliation: Institution | str = "" project: list[str] = field(default_factory=list) + def to_dict(self) -> dict[str, str | list[str]]: + """Convert Person to a dictionary for JSON serialization.""" + + return_dict = { + "family_name": self.family_name, + "given_name": self.given_name, + "orcid": self.orcid, + "email": self.email, + "project": self.project + } + + if self.affiliation.ror: + return_dict["affiliation"] = self.affiliation.ror + elif self.affiliation.display_name: + return_dict["affiliation"] = self.affiliation.display_name + else: + return_dict["affiliation"] = "" + + return return_dict + def format_name(self) -> str: """ Format the name in 'Family, Given' order. @@ -300,17 +347,23 @@ class Person: Returns: list: List of metadata fields representing the author. """ + affiliation_field = None + if isinstance(self.affiliation, Institution): + affiliation_field = self.affiliation.affiliation_field() + else: + affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) + if self.orcid: return [ PrimitiveMetadataField("authorName", False, self.format_name()), - PrimitiveMetadataField("authorAffiliation", False, self.affiliation), + affiliation_field, ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"), PrimitiveMetadataField("authorIdentifier", False, self.orcid) ] else: return [ PrimitiveMetadataField("authorName", False, self.format_name()), - PrimitiveMetadataField("authorAffiliation", False, self.affiliation) + affiliation_field ] def dataset_contact_fields(self) -> list[PrimitiveMetadataField]: @@ -320,12 +373,20 @@ class Person: Returns: list: List of metadata fields for the dataset contact. 
""" + + affiliation_field = None + if isinstance(self.affiliation, Institution): + affiliation_field = self.affiliation.affiliation_field() + else: + affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) + return [ PrimitiveMetadataField("datasetContactName", False, self.format_name()), - PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation), + affiliation_field, PrimitiveMetadataField("datasetContactEmail", False, self.email) ] + @dataclass class License: """ @@ -898,7 +959,7 @@ class CitationBuilder: """ Builds various citation-related metadata fields. """ - def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None: + def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None: """ Initialize the CitationBuilder with data, DOI, and a PIFinder. @@ -909,6 +970,7 @@ class CitationBuilder: """ self.data = data self.doi = doi + self.ror = ror self.pi_finder = pi_finder def build_other_ids(self) -> list[list[PrimitiveMetadataField]]: @@ -995,7 +1057,7 @@ class CitationBuilder: authorship (dict[str, Any]): Authorship metadata. Returns: - Person: Processed author. 
+ Person: Processed author """ display_name = author.get("display_name", "") given_name, family_name = NameProcessor.split_name(display_name) @@ -1003,9 +1065,19 @@ class CitationBuilder: person = Person(family_name, given_name) if affiliations := authorship.get("affiliations"): - affiliation = affiliations[0].get("raw_affiliation_string", "").strip() + affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip()) + person.affiliation = affiliation + if self.ror: + if institutions := authorship.get("institutions"): + institution = institutions[0] + if institution.get("ror"): + affiliation = Institution(institution.get("display_name"), institution.get("ror")) + + person.affiliation = affiliation + + if orcid := author.get("orcid"): person.orcid = normalize_orcid(orcid) @@ -1102,6 +1174,7 @@ class MetadataProcessor: default_subject: str = "Other", contact_mail: str | None = None, upload: bool = False, + ror: bool= False, console: Console | None = None, progress: Progress | None = None, task_id: TaskID | None = None @@ -1115,6 +1188,7 @@ class MetadataProcessor: output_path (Path | None): Path where metadata will be saved. default_subject (str): Default subject. contact_mail (str | None): Contact email address. + ror (bool): Whether to use ROR id for affiliation upload (bool): Whether to upload metadata. console (Console | None): Rich console instance. progress (Progress | None): Progress bar instance. 
@@ -1134,6 +1208,7 @@ class MetadataProcessor: pi_objects = [Person(**pi) for pi in config.PIS] self.pi_finder = PIFinder(pi_objects) self.upload = upload + self.ror = ror self.progress = progress self.task_id = task_id @@ -1250,9 +1325,10 @@ class MetadataProcessor: license_info = LicenseProcessor.process_license(data) abstract_processor = AbstractProcessor(self.api_client) abstract = abstract_processor.get_abstract(self.doi, data, license_info) - citation_builder = CitationBuilder(data, self.doi, self.pi_finder) + citation_builder = CitationBuilder(data, self.doi, self.pi_finder, self.ror) authors, corresponding_authors = citation_builder.build_authors() + author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = [] corresponding_author_fields: list[list[PrimitiveMetadataField]] = [] for author in authors: @@ -1451,8 +1527,15 @@ class MetadataProcessor: """ if self.output_path: try: + # Custom JSON encoder to handle custom objects + class CustomEncoder(json.JSONEncoder): + def default(self, obj): + if hasattr(obj, 'to_dict'): + return obj.to_dict() + return super().default(obj) + with open(self.output_path, "w", encoding="utf-8") as f: - json.dump(metadata, f, indent=4, ensure_ascii=False) + json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder) self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info") except Exception as e: self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error") @@ -1537,7 +1620,8 @@ def process_doi_batch( depositor: str | None = None, default_subject: str = "Medicine, Health and Life Sciences", contact_mail: str | None = None, - upload: bool = False + upload: bool = False, + ror: bool = False ) -> dict[str, list[Any]]: """ Process a batch of DOIs and return a summary of results. @@ -1549,6 +1633,7 @@ def process_doi_batch( default_subject (str): Default subject for metadata. contact_mail (str | None): Contact email address. 
upload (bool): Flag indicating whether to upload metadata to Dataverse. + ror (bool): Flag indicating whether to use ROR id for affiliation. Returns: dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'. @@ -1617,6 +1702,7 @@ def process_doi_batch( default_subject=default_subject, contact_mail=contact_mail, upload=upload, + ror=ror, console=console, progress=progress, task_id=status_task @@ -1688,7 +1774,12 @@ if __name__ == "__main__": parser.add_argument( "-u", "--upload", help="Upload to Dataverse", - action='store_true' + action="store_true" + ) + parser.add_argument( + "-r", "--use-ror", + help="Use ROR ID if available", + action="store_true" ) args = parser.parse_args() @@ -1727,7 +1818,8 @@ if __name__ == "__main__": depositor=args.depositor, default_subject=args.subject, contact_mail=args.contact_mail, - upload=args.upload + upload=args.upload, + ror=args.use_ror ) From 554951265e7b3c8218f1d16dcde80d3a887805d6 Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Tue, 20 May 2025 13:20:18 +0200 Subject: [PATCH 2/3] Add comprehensive docstrings to classes and methods Improve type annotations in Person.to_dict() and CustomEncoder class to better reflect the actual return types. Enhance documentation for Institution, Person, Abstract, and CustomEncoder classes with detailed method descriptions and parameter explanations. --- doi2dataset.py | 104 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 26 deletions(-) diff --git a/doi2dataset.py b/doi2dataset.py index 7a49732..fa0d711 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -277,10 +277,24 @@ class CompoundMetadataField( @dataclass class Institution: + """ + Represents an institution or organization. + + Attributes: + display_name (str): The name of the institution. + ror (str): Research Organization Registry identifier (optional). 
+ """ display_name: str ror: str = "" def affiliation_field(self) -> PrimitiveMetadataField: + """ + Create a metadata field for the affiliation. + + Returns: + PrimitiveMetadataField: A metadata field representing the institution, + using ROR ID when available. + """ if self.ror: expanded_value = { "scheme": "http://www.grid.ac/ontology/", @@ -311,25 +325,36 @@ class Person: affiliation: Institution | str = "" project: list[str] = field(default_factory=list) - def to_dict(self) -> dict[str, str | list[str]]: - """Convert Person to a dictionary for JSON serialization.""" + def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]: + """ + Convert Person to a dictionary for JSON serialization. - return_dict = { - "family_name": self.family_name, - "given_name": self.given_name, - "orcid": self.orcid, - "email": self.email, - "project": self.project - } + Handles affiliations properly by checking if the affiliation + is an Institution object or a string. - if self.affiliation.ror: - return_dict["affiliation"] = self.affiliation.ror - elif self.affiliation.display_name: - return_dict["affiliation"] = self.affiliation.display_name - else: - return_dict["affiliation"] = "" + Returns: + dict: A dictionary containing the person's information including + name, contact details, and affiliation. 
+ """ + return_dict: dict[str, str | list[str] | dict[str, str]] = { + "family_name": self.family_name, + "given_name": self.given_name, + "orcid": self.orcid, + "email": self.email, + "project": self.project + } - return return_dict + if isinstance(self.affiliation, Institution): + if self.affiliation.ror: + return_dict["affiliation"] = self.affiliation.ror + elif self.affiliation.display_name: + return_dict["affiliation"] = self.affiliation.display_name + else: + return_dict["affiliation"] = "" + else: + return_dict["affiliation"] = "" + + return return_dict def format_name(self) -> str: """ @@ -342,17 +367,21 @@ class Person: def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]: """ - Build metadata fields for an author. + Build metadata fields for the author. + + The method handles both Institution objects and string values for affiliations. + Different fields are generated depending on whether ORCID is available. Returns: - list: List of metadata fields representing the author. + list: List of metadata fields representing the author, including name, + affiliation, and optionally ORCID identifier information. """ affiliation_field = None if isinstance(self.affiliation, Institution): affiliation_field = self.affiliation.affiliation_field() else: affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) - + if self.orcid: return [ PrimitiveMetadataField("authorName", False, self.format_name()), @@ -368,10 +397,14 @@ class Person: def dataset_contact_fields(self) -> list[PrimitiveMetadataField]: """ - Build metadata fields for dataset contact information. + Generate metadata fields for dataset contact. + + The method handles both Institution objects and string values for affiliations. + Creates fields for the contact name, affiliation, and email address. Returns: - list: List of metadata fields for the dataset contact. 
+ list: List of metadata fields for the dataset contact including name, + affiliation, and email address. """ affiliation_field = None @@ -379,7 +412,7 @@ class Person: affiliation_field = self.affiliation.affiliation_field() else: affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) - + return [ PrimitiveMetadataField("datasetContactName", False, self.format_name()), affiliation_field, @@ -414,6 +447,12 @@ class Abstract: source: str def __post_init__(self): + """ + Validate that the abstract source is one of the allowed values. + + Raises: + ValueError: If source is not one of the allowed values. + """ allowed_sources = ["crossref", "openalex", "none"] if self.source not in allowed_sources: raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.") @@ -1529,10 +1568,23 @@ class MetadataProcessor: try: # Custom JSON encoder to handle custom objects class CustomEncoder(json.JSONEncoder): - def default(self, obj): - if hasattr(obj, 'to_dict'): - return obj.to_dict() - return super().default(obj) + """ + Custom JSON encoder that handles objects with to_dict method. + + This allows for proper serialization of custom classes like + Institution and Person by calling their to_dict method when + available. + + Args: + o: The object to serialize. + + Returns: + A JSON-serializable representation of the object. 
+ """ + def default(self, o: Any) -> Any: + if hasattr(o, 'to_dict'): + return o.to_dict() + return super().default(o) with open(self.output_path, "w", encoding="utf-8") as f: json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder) From 77930649b16245408434be380550844088f82642 Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Tue, 20 May 2025 13:27:54 +0200 Subject: [PATCH 3/3] Fix affiliation field name in Person class --- doi2dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doi2dataset.py b/doi2dataset.py index fa0d711..9cbd676 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -279,7 +279,7 @@ class CompoundMetadataField( class Institution: """ Represents an institution or organization. - + Attributes: display_name (str): The name of the institution. ror (str): Research Organization Registry identifier (optional). @@ -290,7 +290,7 @@ class Institution: def affiliation_field(self) -> PrimitiveMetadataField: """ Create a metadata field for the affiliation. - + Returns: PrimitiveMetadataField: A metadata field representing the institution, using ROR ID when available. @@ -411,7 +411,7 @@ class Person: if isinstance(self.affiliation, Institution): affiliation_field = self.affiliation.affiliation_field() else: - affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation) + affiliation_field = PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation) return [ PrimitiveMetadataField("datasetContactName", False, self.format_name()), @@ -449,7 +449,7 @@ class Abstract: def __post_init__(self): """ Validate that the abstract source is one of the allowed values. - + Raises: ValueError: If source is not one of the allowed values. """