Compare commits

...

3 commits

Author SHA1 Message Date
77930649b1
Fix affiliation field name in Person class
All checks were successful
Test pipeline / test (push) Successful in 13s
2025-05-20 13:27:54 +02:00
554951265e
Add comprehensive docstrings to classes and methods
Improve type annotations in Person.to_dict() and CustomEncoder class
to better reflect the actual return types. Enhance documentation for
Institution, Person, Abstract, and CustomEncoder classes with detailed
method descriptions and parameter explanations.
2025-05-20 13:20:18 +02:00
f84a274848
Add support for ROR institution identifiers in affiliations 2025-05-20 13:07:43 +02:00

View file

@ -158,6 +158,7 @@ class BaseMetadataField[T]:
multiple: bool
value: T
type: FieldType = field(init=False)
expanded_value: dict[str, str] | None = field(default=None)
def __post_init__(self) -> None:
"""
@ -196,19 +197,29 @@ class PrimitiveMetadataField(BaseMetadataField[str]):
def _set_type(self) -> None:
self.type = FieldType.PRIMITIVE
def to_dict(self) -> dict[str, str | bool]:
def to_dict(self) -> dict[str, str | bool | dict[str, str]]:
"""
Convert the primitive metadata field to a dictionary representation.
Returns:
dict[str, str | bool]: Dictionary with field properties.
"""
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
}
if self.expanded_value:
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
"expandedValue": self.expanded_value
}
else:
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
}
@dataclass
class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]):
@ -264,6 +275,36 @@ class CompoundMetadataField(
"value": value_list
}
@dataclass
class Institution:
    """
    Represents an institution or organization.

    Attributes:
        display_name (str): The name of the institution.
        ror (str): Research Organization Registry identifier (optional,
            defaults to an empty string).
    """
    display_name: str
    ror: str = ""

    def affiliation_field(self, type_name: str = "authorAffiliation") -> PrimitiveMetadataField:
        """
        Create a metadata field for the affiliation.

        Args:
            type_name (str): Name of the generated metadata field. Defaults
                to "authorAffiliation"; pass another name (for example
                "datasetContactAffiliation") when the affiliation belongs
                to a different compound field.

        Returns:
            PrimitiveMetadataField: A non-multiple field whose value is the
                ROR ID (with an expanded value carrying the display name)
                when a ROR ID is set, otherwise the display name.
        """
        if not self.ror:
            return PrimitiveMetadataField(type_name, False, self.display_name)

        # NOTE(review): the scheme URI mentions GRID although the value is a
        # ROR ID - confirm this is what the target vocabulary expects.
        expanded_value = {
            "scheme": "http://www.grid.ac/ontology/",
            "termName": self.display_name,
            "@type": "https://schema.org/Organization"
        }
        return PrimitiveMetadataField(type_name, False, self.ror, expanded_value)
@dataclass
class Person:
"""
@ -274,16 +315,47 @@ class Person:
given_name (str): Given name of the person.
orcid (str): ORCID identifier (optional).
email (str): Email address (optional).
affiliation (str): Affiliation of the person (optional).
affiliation (Institution | str): Affiliation of the person, either an Institution or a plain string (optional).
project (list[str]): List of associated projects.
"""
family_name: str
given_name: str
orcid: str = ""
email: str = ""
affiliation: str = ""
affiliation: Institution | str = ""
project: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]:
    """
    Convert Person to a dictionary for JSON serialization.

    The affiliation is flattened to a single string: for an Institution
    the ROR ID is used when set, otherwise its display name; a plain
    string affiliation is kept as it is. A missing affiliation becomes
    an empty string.

    Returns:
        dict: A dictionary with the keys "family_name", "given_name",
            "orcid", "email", "project" and "affiliation".
    """
    if isinstance(self.affiliation, Institution):
        # Prefer the ROR ID, fall back to the display name.
        affiliation = self.affiliation.ror or self.affiliation.display_name or ""
    else:
        # Plain string affiliations must survive serialization instead of
        # being replaced by an empty string.
        affiliation = self.affiliation or ""

    return {
        "family_name": self.family_name,
        "given_name": self.given_name,
        "orcid": self.orcid,
        "email": self.email,
        "project": self.project,
        "affiliation": affiliation,
    }
def format_name(self) -> str:
"""
Format the name in 'Family, Given' order.
@ -295,37 +367,59 @@ class Person:
def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]:
    """
    Build metadata fields for the author.

    The method handles both Institution objects and string values for affiliations.
    Different fields are generated depending on whether ORCID is available.

    Returns:
        list: List of metadata fields representing the author, including name,
            affiliation, and optionally ORCID identifier information.
    """
    if isinstance(self.affiliation, Institution):
        affiliation_field = self.affiliation.affiliation_field()
    else:
        affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation)

    fields: list[PrimitiveMetadataField | ControlledVocabularyMetadataField] = [
        PrimitiveMetadataField("authorName", False, self.format_name()),
        affiliation_field,
    ]
    # The identifier scheme and identifier are only added for authors
    # that actually have an ORCID.
    if self.orcid:
        fields += [
            ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
            PrimitiveMetadataField("authorIdentifier", False, self.orcid),
        ]
    return fields
def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
    """
    Generate metadata fields for dataset contact.

    The method handles both Institution objects and string values for affiliations.
    Creates fields for the contact name, affiliation, and email address.

    Returns:
        list: List of metadata fields for the dataset contact including name,
            affiliation, and email address.
    """
    if isinstance(self.affiliation, Institution):
        affiliation_field = self.affiliation.affiliation_field()
        # Institution.affiliation_field() names its result "authorAffiliation";
        # inside the dataset contact the field must carry the contact name,
        # exactly like the string branch below.
        affiliation_field.name = "datasetContactAffiliation"
    else:
        affiliation_field = PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation)

    return [
        PrimitiveMetadataField("datasetContactName", False, self.format_name()),
        affiliation_field,
        PrimitiveMetadataField("datasetContactEmail", False, self.email)
    ]
@dataclass
class License:
"""
@ -353,6 +447,12 @@ class Abstract:
source: str
def __post_init__(self):
"""
Validate that the abstract source is one of the allowed values.
Raises:
ValueError: If source is not one of the allowed values.
"""
allowed_sources = ["crossref", "openalex", "none"]
if self.source not in allowed_sources:
raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.")
@ -898,7 +998,7 @@ class CitationBuilder:
"""
Builds various citation-related metadata fields.
"""
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None:
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None:
"""
Initialize the CitationBuilder with data, DOI, and a PIFinder.
@ -909,6 +1009,7 @@ class CitationBuilder:
"""
self.data = data
self.doi = doi
self.ror = ror
self.pi_finder = pi_finder
def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
@ -995,7 +1096,7 @@ class CitationBuilder:
authorship (dict[str, Any]): Authorship metadata.
Returns:
Person: Processed author.
Person: Processed author.
"""
display_name = author.get("display_name", "")
given_name, family_name = NameProcessor.split_name(display_name)
@ -1003,9 +1104,19 @@ class CitationBuilder:
person = Person(family_name, given_name)
if affiliations := authorship.get("affiliations"):
affiliation = affiliations[0].get("raw_affiliation_string", "").strip()
affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip())
person.affiliation = affiliation
if self.ror:
if institutions := authorship.get("institutions"):
institution = institutions[0]
if institution.get("ror"):
affiliation = Institution(institution.get("display_name"), institution.get("ror"))
person.affiliation = affiliation
if orcid := author.get("orcid"):
person.orcid = normalize_orcid(orcid)
@ -1102,6 +1213,7 @@ class MetadataProcessor:
default_subject: str = "Other",
contact_mail: str | None = None,
upload: bool = False,
ror: bool= False,
console: Console | None = None,
progress: Progress | None = None,
task_id: TaskID | None = None
@ -1115,6 +1227,7 @@ class MetadataProcessor:
output_path (Path | None): Path where metadata will be saved.
default_subject (str): Default subject.
contact_mail (str | None): Contact email address.
ror (bool): Whether to use the ROR ID for affiliations.
upload (bool): Whether to upload metadata.
console (Console | None): Rich console instance.
progress (Progress | None): Progress bar instance.
@ -1134,6 +1247,7 @@ class MetadataProcessor:
pi_objects = [Person(**pi) for pi in config.PIS]
self.pi_finder = PIFinder(pi_objects)
self.upload = upload
self.ror = ror
self.progress = progress
self.task_id = task_id
@ -1250,9 +1364,10 @@ class MetadataProcessor:
license_info = LicenseProcessor.process_license(data)
abstract_processor = AbstractProcessor(self.api_client)
abstract = abstract_processor.get_abstract(self.doi, data, license_info)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder, self.ror)
authors, corresponding_authors = citation_builder.build_authors()
author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
for author in authors:
@ -1451,8 +1566,28 @@ class MetadataProcessor:
"""
if self.output_path:
try:
# Custom JSON encoder to handle custom objects
class CustomEncoder(json.JSONEncoder):
    """
    JSON encoder that serializes objects exposing a ``to_dict`` attribute.

    Objects with ``to_dict`` are encoded as whatever that call returns;
    everything else is left to the standard ``json.JSONEncoder`` handling.
    """

    def default(self, o: Any) -> Any:
        """
        Return a JSON-serializable representation of ``o``.

        Args:
            o: The object to serialize.

        Returns:
            The result of ``o.to_dict()`` when ``o`` has a ``to_dict``
            attribute; otherwise the base class decides (which raises
            ``TypeError`` for unsupported objects).
        """
        if not hasattr(o, "to_dict"):
            return super().default(o)
        return o.to_dict()
with open(self.output_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=4, ensure_ascii=False)
json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder)
self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
except Exception as e:
self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
@ -1537,7 +1672,8 @@ def process_doi_batch(
depositor: str | None = None,
default_subject: str = "Medicine, Health and Life Sciences",
contact_mail: str | None = None,
upload: bool = False
upload: bool = False,
ror: bool = False
) -> dict[str, list[Any]]:
"""
Process a batch of DOIs and return a summary of results.
@ -1549,6 +1685,7 @@ def process_doi_batch(
default_subject (str): Default subject for metadata.
contact_mail (str | None): Contact email address.
upload (bool): Flag indicating whether to upload metadata to Dataverse.
ror (bool): Flag indicating whether to use the ROR ID for affiliations.
Returns:
dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
@ -1617,6 +1754,7 @@ def process_doi_batch(
default_subject=default_subject,
contact_mail=contact_mail,
upload=upload,
ror=ror,
console=console,
progress=progress,
task_id=status_task
@ -1688,7 +1826,12 @@ if __name__ == "__main__":
parser.add_argument(
"-u", "--upload",
help="Upload to Dataverse",
action='store_true'
action="store_true"
)
parser.add_argument(
"-r", "--use-ror",
help="Use ROR ID if available",
action="store_true"
)
args = parser.parse_args()
@ -1727,7 +1870,8 @@ if __name__ == "__main__":
depositor=args.depositor,
default_subject=args.subject,
contact_mail=args.contact_mail,
upload=args.upload
upload=args.upload,
ror=args.use_ror
)