Compare commits

..

No commits in common. "77930649b16245408434be380550844088f82642" and "f7130898fcfe6a0708c3f505300a0f8157ff0ce3" have entirely different histories.

View file

@ -158,7 +158,6 @@ class BaseMetadataField[T]:
multiple: bool
value: T
type: FieldType = field(init=False)
expanded_value: dict[str, str] | None = field(default=None)
def __post_init__(self) -> None:
"""
@ -197,29 +196,19 @@ class PrimitiveMetadataField(BaseMetadataField[str]):
def _set_type(self) -> None:
# Tag this field as a primitive field by storing FieldType.PRIMITIVE in ``type``.
self.type = FieldType.PRIMITIVE
def to_dict(self) -> dict[str, str | bool | dict[str, str]]:
def to_dict(self) -> dict[str, str | bool]:
"""
Convert the primitive metadata field to a dictionary representation.
Returns:
dict[str, str | bool]: Dictionary with field properties.
"""
if self.expanded_value:
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
"expandedValue": self.expanded_value
}
else:
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
}
return {
"typeName": self.name,
"typeClass": self.type.value,
"multiple": self.multiple,
"value": self.value,
}
@dataclass
class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]):
@ -275,36 +264,6 @@ class CompoundMetadataField(
"value": value_list
}
@dataclass
class Institution:
    """
    Represents an institution or organization.

    Attributes:
        display_name (str): The name of the institution.
        ror (str): Research Organization Registry identifier (optional).
    """

    display_name: str
    ror: str = ""

    def affiliation_field(self, field_name: str = "authorAffiliation") -> PrimitiveMetadataField:
        """
        Create a metadata field for the affiliation.

        The field name used to be fixed to ``"authorAffiliation"``, which is
        wrong when the affiliation belongs to a dataset contact. It is now a
        parameter so that such callers can request
        ``"datasetContactAffiliation"``; the default keeps existing callers
        unchanged.

        Args:
            field_name (str): ``typeName`` of the generated field.

        Returns:
            PrimitiveMetadataField: A metadata field representing the institution,
                using the ROR ID (plus an expanded value carrying the display
                name) when available and the display name otherwise.
        """
        # Without a ROR identifier only the plain name can be reported.
        if not self.ror:
            return PrimitiveMetadataField(field_name, False, self.display_name)

        expanded_value = {
            "scheme": "http://www.grid.ac/ontology/",
            "termName": self.display_name,
            "@type": "https://schema.org/Organization",
        }
        return PrimitiveMetadataField(field_name, False, self.ror, expanded_value)
@dataclass
class Person:
"""
@ -315,47 +274,16 @@ class Person:
given_name (str): Given name of the person.
orcid (str): ORCID identifier (optional).
email (str): Email address (optional).
affiliation (Institution): Affiliation of the person (optional).
affiliation (str): Affiliation of the person (optional).
project (list[str]): List of associated projects.
"""
family_name: str
given_name: str
orcid: str = ""
email: str = ""
affiliation: Institution | str = ""
affiliation: str = ""
project: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]:
"""
Convert Person to a dictionary for JSON serialization.
Handles affiliations properly by checking if the affiliation
is an Institution object or a string.
Returns:
dict: A dictionary containing the person's information including
name, contact details, and affiliation.
"""
return_dict: dict[str, str | list[str] | dict[str, str]] = {
"family_name": self.family_name,
"given_name": self.given_name,
"orcid": self.orcid,
"email": self.email,
"project": self.project
}
if isinstance(self.affiliation, Institution):
if self.affiliation.ror:
return_dict["affiliation"] = self.affiliation.ror
elif self.affiliation.display_name:
return_dict["affiliation"] = self.affiliation.display_name
else:
return_dict["affiliation"] = ""
else:
return_dict["affiliation"] = ""
return return_dict
def format_name(self) -> str:
"""
Format the name in 'Family, Given' order.
@ -367,59 +295,37 @@ class Person:
def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]:
"""
Build metadata fields for the author.
The method handles both Institution objects and string values for affiliations.
Different fields are generated depending on whether ORCID is available.
Build metadata fields for an author.
Returns:
list: List of metadata fields representing the author, including name,
affiliation, and optionally ORCID identifier information.
list: List of metadata fields representing the author.
"""
affiliation_field = None
if isinstance(self.affiliation, Institution):
affiliation_field = self.affiliation.affiliation_field()
else:
affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
if self.orcid:
return [
PrimitiveMetadataField("authorName", False, self.format_name()),
affiliation_field,
PrimitiveMetadataField("authorAffiliation", False, self.affiliation),
ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
PrimitiveMetadataField("authorIdentifier", False, self.orcid)
]
else:
return [
PrimitiveMetadataField("authorName", False, self.format_name()),
affiliation_field
PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
]
def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
"""
Generate metadata fields for dataset contact.
The method handles both Institution objects and string values for affiliations.
Creates fields for the contact name, affiliation, and email address.
Build metadata fields for dataset contact information.
Returns:
list: List of metadata fields for the dataset contact including name,
affiliation, and email address.
list: List of metadata fields for the dataset contact.
"""
affiliation_field = None
if isinstance(self.affiliation, Institution):
affiliation_field = self.affiliation.affiliation_field()
else:
affiliation_field = PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation)
return [
PrimitiveMetadataField("datasetContactName", False, self.format_name()),
affiliation_field,
PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation),
PrimitiveMetadataField("datasetContactEmail", False, self.email)
]
@dataclass
class License:
"""
@ -447,12 +353,6 @@ class Abstract:
source: str
def __post_init__(self) -> None:
"""
Validate that the abstract source is one of the allowed values.
Raises:
ValueError: If source is not one of the allowed values.
"""
# The accepted values for ``source``; the same list is echoed in the error message below.
allowed_sources = ["crossref", "openalex", "none"]
if self.source not in allowed_sources:
raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.")
@ -998,7 +898,7 @@ class CitationBuilder:
"""
Builds various citation-related metadata fields.
"""
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None:
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None:
"""
Initialize the CitationBuilder with data, DOI, and a PIFinder.
@ -1009,7 +909,6 @@ class CitationBuilder:
"""
self.data = data
self.doi = doi
self.ror = ror
self.pi_finder = pi_finder
def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
@ -1096,7 +995,7 @@ class CitationBuilder:
authorship (dict[str, Any]): Authorship metadata.
Returns:
Person: Processed author
Person: Processed author.
"""
display_name = author.get("display_name", "")
given_name, family_name = NameProcessor.split_name(display_name)
@ -1104,19 +1003,9 @@ class CitationBuilder:
person = Person(family_name, given_name)
if affiliations := authorship.get("affiliations"):
affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip())
affiliation = affiliations[0].get("raw_affiliation_string", "").strip()
person.affiliation = affiliation
if self.ror:
if institutions := authorship.get("institutions"):
institution = institutions[0]
if institution.get("ror"):
affiliation = Institution(institution.get("display_name"), institution.get("ror"))
person.affiliation = affiliation
if orcid := author.get("orcid"):
person.orcid = normalize_orcid(orcid)
@ -1213,7 +1102,6 @@ class MetadataProcessor:
default_subject: str = "Other",
contact_mail: str | None = None,
upload: bool = False,
ror: bool= False,
console: Console | None = None,
progress: Progress | None = None,
task_id: TaskID | None = None
@ -1227,7 +1115,6 @@ class MetadataProcessor:
output_path (Path | None): Path where metadata will be saved.
default_subject (str): Default subject.
contact_mail (str | None): Contact email address.
ror (bool): Whether to use ROR id for affiliation
upload (bool): Whether to upload metadata.
console (Console | None): Rich console instance.
progress (Progress | None): Progress bar instance.
@ -1247,7 +1134,6 @@ class MetadataProcessor:
pi_objects = [Person(**pi) for pi in config.PIS]
self.pi_finder = PIFinder(pi_objects)
self.upload = upload
self.ror = ror
self.progress = progress
self.task_id = task_id
@ -1364,10 +1250,9 @@ class MetadataProcessor:
license_info = LicenseProcessor.process_license(data)
abstract_processor = AbstractProcessor(self.api_client)
abstract = abstract_processor.get_abstract(self.doi, data, license_info)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder, self.ror)
citation_builder = CitationBuilder(data, self.doi, self.pi_finder)
authors, corresponding_authors = citation_builder.build_authors()
author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
for author in authors:
@ -1566,28 +1451,8 @@ class MetadataProcessor:
"""
if self.output_path:
try:
# Custom JSON encoder to handle custom objects
class CustomEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that handles objects with a to_dict method.

    Objects the standard encoder cannot serialize are converted by calling
    their ``to_dict`` method when they provide one (Person does). Anything
    else is passed to the base class, which raises the usual ``TypeError``.
    """

    def default(self, o: Any) -> Any:
        """
        Return a JSON-serializable representation of ``o``.

        Args:
            o: The object to serialize.

        Returns:
            The result of ``o.to_dict()``.

        Raises:
            TypeError: If ``o`` has no callable ``to_dict`` attribute.
        """
        to_dict = getattr(o, "to_dict", None)
        # Require a callable: a non-callable ``to_dict`` attribute would
        # otherwise fail with a misleading "object is not callable" error
        # instead of the standard "not JSON serializable" one.
        if callable(to_dict):
            return to_dict()
        return super().default(o)
with open(self.output_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder)
json.dump(metadata, f, indent=4, ensure_ascii=False)
self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
except Exception as e:
self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
@ -1672,8 +1537,7 @@ def process_doi_batch(
depositor: str | None = None,
default_subject: str = "Medicine, Health and Life Sciences",
contact_mail: str | None = None,
upload: bool = False,
ror: bool = False
upload: bool = False
) -> dict[str, list[Any]]:
"""
Process a batch of DOIs and return a summary of results.
@ -1685,7 +1549,6 @@ def process_doi_batch(
default_subject (str): Default subject for metadata.
contact_mail (str | None): Contact email address.
upload (bool): Flag indicating whether to upload metadata to Dataverse.
ror (bool): Flag indicating whether to use ROR id for affiliation.
Returns:
dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
@ -1754,7 +1617,6 @@ def process_doi_batch(
default_subject=default_subject,
contact_mail=contact_mail,
upload=upload,
ror=ror,
console=console,
progress=progress,
task_id=status_task
@ -1826,12 +1688,7 @@ if __name__ == "__main__":
parser.add_argument(
"-u", "--upload",
help="Upload to Dataverse",
action="store_true"
)
parser.add_argument(
"-r", "--use-ror",
help="Use ROR ID if available",
action="store_true"
action='store_true'
)
args = parser.parse_args()
@ -1870,8 +1727,7 @@ if __name__ == "__main__":
depositor=args.depositor,
default_subject=args.subject,
contact_mail=args.contact_mail,
upload=args.upload,
ror=args.use_ror
upload=args.upload
)