Add support for ROR institution identifiers in affiliations

This commit is contained in:
Alexander Minges 2025-05-20 13:07:43 +02:00
parent f7130898fc
commit f84a274848
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4

View file

@ -158,6 +158,7 @@ class BaseMetadataField[T]:
multiple: bool
value: T
type: FieldType = field(init=False)
expanded_value: dict[str, str] | None = field(default=None)
def __post_init__(self) -> None:
"""
@ -196,13 +197,23 @@ class PrimitiveMetadataField(BaseMetadataField[str]):
def _set_type(self) -> None:
    """Set this field's ``type`` attribute to ``FieldType.PRIMITIVE``."""
    self.type = FieldType.PRIMITIVE
def to_dict(self) -> dict[str, str | bool]:
def to_dict(self) -> dict[str, str | bool | dict[str, str]]:
"""
Convert the primitive metadata field to a dictionary representation.
Returns:
dict[str, str | bool | dict[str, str]]: Dictionary with field properties; includes "expandedValue" when an expanded value is set.
"""
if self.expanded_value:
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
"expandedValue": self.expanded_value
}
else:
return {
"typeName": self.name,
"typeClass": self.type.value,
@ -264,6 +275,22 @@ class CompoundMetadataField(
"value": value_list
}
@dataclass
class Institution:
    """
    Institution a person is affiliated with.

    Attributes:
        display_name (str): Human-readable name of the institution.
        ror (str): ROR identifier of the institution (optional).
    """
    display_name: str
    ror: str = ""

    def affiliation_field(self, field_name: str = "authorAffiliation") -> PrimitiveMetadataField:
        """
        Build the metadata field describing this institution.

        When a ROR identifier is set, the identifier becomes the field value
        and the display name is carried in the expanded value. Otherwise the
        display name itself is used as the field value.

        Args:
            field_name (str): Type name of the generated field. Defaults to
                "authorAffiliation"; pass e.g. "datasetContactAffiliation"
                when the field belongs to a dataset contact rather than an
                author.

        Returns:
            PrimitiveMetadataField: Single-valued affiliation field.
        """
        if not self.ror:
            return PrimitiveMetadataField(field_name, False, self.display_name)

        # NOTE(review): the scheme URI points at grid.ac although the value is
        # a ROR id -- confirm this is what the target repository expects.
        expanded_value = {
            "scheme": "http://www.grid.ac/ontology/",
            "termName": self.display_name,
            "@type": "https://schema.org/Organization",
        }
        return PrimitiveMetadataField(field_name, False, self.ror, expanded_value)
@dataclass
class Person:
"""
@ -274,16 +301,36 @@ class Person:
given_name (str): Given name of the person.
orcid (str): ORCID identifier (optional).
email (str): Email address (optional).
affiliation (str): Affiliation of the person (optional).
affiliation (Institution | str): Affiliation of the person, either an Institution or a plain name (optional).
project (list[str]): List of associated projects.
"""
family_name: str
given_name: str
orcid: str = ""
email: str = ""
affiliation: str = ""
affiliation: Institution | str = ""
project: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, str | list[str]]:
"""Convert Person to a dictionary for JSON serialization."""
return_dict = {
"family_name": self.family_name,
"given_name": self.given_name,
"orcid": self.orcid,
"email": self.email,
"project": self.project
}
if self.affiliation.ror:
return_dict["affiliation"] = self.affiliation.ror
elif self.affiliation.display_name:
return_dict["affiliation"] = self.affiliation.display_name
else:
return_dict["affiliation"] = ""
return return_dict
def format_name(self) -> str:
"""
Format the name in 'Family, Given' order.
@ -300,17 +347,23 @@ class Person:
Returns:
list: List of metadata fields representing the author.
"""
affiliation_field = None
if isinstance(self.affiliation, Institution):
affiliation_field = self.affiliation.affiliation_field()
else:
affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
if self.orcid:
return [
PrimitiveMetadataField("authorName", False, self.format_name()),
PrimitiveMetadataField("authorAffiliation", False, self.affiliation),
affiliation_field,
ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
PrimitiveMetadataField("authorIdentifier", False, self.orcid)
]
else:
return [
PrimitiveMetadataField("authorName", False, self.format_name()),
PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
affiliation_field
]
def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
@ -320,12 +373,20 @@ class Person:
Returns:
list: List of metadata fields for the dataset contact.
"""
affiliation_field = None
if isinstance(self.affiliation, Institution):
affiliation_field = self.affiliation.affiliation_field()
else:
affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
return [
PrimitiveMetadataField("datasetContactName", False, self.format_name()),
PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation),
affiliation_field,
PrimitiveMetadataField("datasetContactEmail", False, self.email)
]
@dataclass
class License:
"""
@ -898,7 +959,7 @@ class CitationBuilder:
"""
Builds various citation-related metadata fields.
"""
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None:
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None:
"""
Initialize the CitationBuilder with data, DOI, and a PIFinder.
@ -909,6 +970,7 @@ class CitationBuilder:
"""
self.data = data
self.doi = doi
self.ror = ror
self.pi_finder = pi_finder
def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
@ -995,7 +1057,7 @@ class CitationBuilder:
authorship (dict[str, Any]): Authorship metadata.
Returns:
Person: Processed author.
Person: Processed author
"""
display_name = author.get("display_name", "")
given_name, family_name = NameProcessor.split_name(display_name)
@ -1003,9 +1065,19 @@ class CitationBuilder:
person = Person(family_name, given_name)
if affiliations := authorship.get("affiliations"):
affiliation = affiliations[0].get("raw_affiliation_string", "").strip()
affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip())
person.affiliation = affiliation
if self.ror:
if institutions := authorship.get("institutions"):
institution = institutions[0]
if institution.get("ror"):
affiliation = Institution(institution.get("display_name"), institution.get("ror"))
person.affiliation = affiliation
if orcid := author.get("orcid"):
person.orcid = normalize_orcid(orcid)
@ -1102,6 +1174,7 @@ class MetadataProcessor:
default_subject: str = "Other",
contact_mail: str | None = None,
upload: bool = False,
ror: bool= False,
console: Console | None = None,
progress: Progress | None = None,
task_id: TaskID | None = None
@ -1115,6 +1188,7 @@ class MetadataProcessor:
output_path (Path | None): Path where metadata will be saved.
default_subject (str): Default subject.
contact_mail (str | None): Contact email address.
ror (bool): Whether to use ROR id for affiliation
upload (bool): Whether to upload metadata.
console (Console | None): Rich console instance.
progress (Progress | None): Progress bar instance.
@ -1134,6 +1208,7 @@ class MetadataProcessor:
pi_objects = [Person(**pi) for pi in config.PIS]
self.pi_finder = PIFinder(pi_objects)
self.upload = upload
self.ror = ror
self.progress = progress
self.task_id = task_id
@ -1250,9 +1325,10 @@ class MetadataProcessor:
license_info = LicenseProcessor.process_license(data)
abstract_processor = AbstractProcessor(self.api_client)
abstract = abstract_processor.get_abstract(self.doi, data, license_info)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder, self.ror)
authors, corresponding_authors = citation_builder.build_authors()
author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
for author in authors:
@ -1451,8 +1527,15 @@ class MetadataProcessor:
"""
if self.output_path:
try:
# Custom JSON encoder to handle custom objects
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that serializes objects exposing a ``to_dict`` method."""

    def default(self, obj):
        """Return ``obj.to_dict()`` when available, else defer to the base encoder."""
        if not hasattr(obj, "to_dict"):
            # The base implementation raises TypeError for unsupported objects.
            return super().default(obj)
        return obj.to_dict()
with open(self.output_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=4, ensure_ascii=False)
json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder)
self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
except Exception as e:
self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
@ -1537,7 +1620,8 @@ def process_doi_batch(
depositor: str | None = None,
default_subject: str = "Medicine, Health and Life Sciences",
contact_mail: str | None = None,
upload: bool = False
upload: bool = False,
ror: bool = False
) -> dict[str, list[Any]]:
"""
Process a batch of DOIs and return a summary of results.
@ -1549,6 +1633,7 @@ def process_doi_batch(
default_subject (str): Default subject for metadata.
contact_mail (str | None): Contact email address.
upload (bool): Flag indicating whether to upload metadata to Dataverse.
ror (bool): Flag indicating whether to use the ROR ID for affiliations when available.
Returns:
dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
@ -1617,6 +1702,7 @@ def process_doi_batch(
default_subject=default_subject,
contact_mail=contact_mail,
upload=upload,
ror=ror,
console=console,
progress=progress,
task_id=status_task
@ -1688,7 +1774,12 @@ if __name__ == "__main__":
parser.add_argument(
"-u", "--upload",
help="Upload to Dataverse",
action='store_true'
action="store_true"
)
parser.add_argument(
"-r", "--use-ror",
help="Use ROR ID if available",
action="store_true"
)
args = parser.parse_args()
@ -1727,7 +1818,8 @@ if __name__ == "__main__":
depositor=args.depositor,
default_subject=args.subject,
contact_mail=args.contact_mail,
upload=args.upload
upload=args.upload,
ror=args.use_ror
)