Compare commits
No commits in common. "77930649b16245408434be380550844088f82642" and "f7130898fcfe6a0708c3f505300a0f8157ff0ce3" have entirely different histories.
77930649b1
...
f7130898fc
1 changed file with 24 additions and 168 deletions
192
doi2dataset.py
192
doi2dataset.py
|
@ -158,7 +158,6 @@ class BaseMetadataField[T]:
|
||||||
multiple: bool
|
multiple: bool
|
||||||
value: T
|
value: T
|
||||||
type: FieldType = field(init=False)
|
type: FieldType = field(init=False)
|
||||||
expanded_value: dict[str, str] | None = field(default=None)
|
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -197,29 +196,19 @@ class PrimitiveMetadataField(BaseMetadataField[str]):
|
||||||
def _set_type(self) -> None:
|
def _set_type(self) -> None:
|
||||||
self.type = FieldType.PRIMITIVE
|
self.type = FieldType.PRIMITIVE
|
||||||
|
|
||||||
def to_dict(self) -> dict[str, str | bool | dict[str, str]]:
|
def to_dict(self) -> dict[str, str | bool]:
|
||||||
"""
|
"""
|
||||||
Convert the primitive metadata field to a dictionary representation.
|
Convert the primitive metadata field to a dictionary representation.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, str | bool]: Dictionary with field properties.
|
dict[str, str | bool]: Dictionary with field properties.
|
||||||
"""
|
"""
|
||||||
|
return {
|
||||||
if self.expanded_value:
|
"typeName": self.name,
|
||||||
return {
|
"typeClass": self.type.value,
|
||||||
"typeName": self.name,
|
"multiple": self.multiple,
|
||||||
"typeClass": self.type.value,
|
"value": self.value,
|
||||||
"multiple": self.multiple,
|
}
|
||||||
"value": self.value,
|
|
||||||
"expandedValue": self.expanded_value
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {
|
|
||||||
"typeName": self.name,
|
|
||||||
"typeClass": self.type.value,
|
|
||||||
"multiple": self.multiple,
|
|
||||||
"value": self.value,
|
|
||||||
}
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]):
|
class ControlledVocabularyMetadataField(BaseMetadataField[str | list[str]]):
|
||||||
|
@ -275,36 +264,6 @@ class CompoundMetadataField(
|
||||||
"value": value_list
|
"value": value_list
|
||||||
}
|
}
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Institution:
|
|
||||||
"""
|
|
||||||
Represents an institution or organization.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
display_name (str): The name of the institution.
|
|
||||||
ror (str): Research Organization Registry identifier (optional).
|
|
||||||
"""
|
|
||||||
display_name: str
|
|
||||||
ror: str = ""
|
|
||||||
|
|
||||||
def affiliation_field(self) -> PrimitiveMetadataField:
|
|
||||||
"""
|
|
||||||
Create a metadata field for the affiliation.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
PrimitiveMetadataField: A metadata field representing the institution,
|
|
||||||
using ROR ID when available.
|
|
||||||
"""
|
|
||||||
if self.ror:
|
|
||||||
expanded_value = {
|
|
||||||
"scheme": "http://www.grid.ac/ontology/",
|
|
||||||
"termName": self.display_name,
|
|
||||||
"@type": "https://schema.org/Organization"
|
|
||||||
}
|
|
||||||
return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value)
|
|
||||||
else:
|
|
||||||
return PrimitiveMetadataField("authorAffiliation", False, self.display_name)
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Person:
|
class Person:
|
||||||
"""
|
"""
|
||||||
|
@ -315,47 +274,16 @@ class Person:
|
||||||
given_name (str): Given name of the person.
|
given_name (str): Given name of the person.
|
||||||
orcid (str): ORCID identifier (optional).
|
orcid (str): ORCID identifier (optional).
|
||||||
email (str): Email address (optional).
|
email (str): Email address (optional).
|
||||||
affiliation (Institution): Affiliation of the person (optional).
|
affiliation (str): Affiliation of the person (optional).
|
||||||
project (list[str]): List of associated projects.
|
project (list[str]): List of associated projects.
|
||||||
"""
|
"""
|
||||||
family_name: str
|
family_name: str
|
||||||
given_name: str
|
given_name: str
|
||||||
orcid: str = ""
|
orcid: str = ""
|
||||||
email: str = ""
|
email: str = ""
|
||||||
affiliation: Institution | str = ""
|
affiliation: str = ""
|
||||||
project: list[str] = field(default_factory=list)
|
project: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]:
|
|
||||||
"""
|
|
||||||
Convert Person to a dictionary for JSON serialization.
|
|
||||||
|
|
||||||
Handles affiliations properly by checking if the affiliation
|
|
||||||
is an Institution object or a string.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: A dictionary containing the person's information including
|
|
||||||
name, contact details, and affiliation.
|
|
||||||
"""
|
|
||||||
return_dict: dict[str, str | list[str] | dict[str, str]] = {
|
|
||||||
"family_name": self.family_name,
|
|
||||||
"given_name": self.given_name,
|
|
||||||
"orcid": self.orcid,
|
|
||||||
"email": self.email,
|
|
||||||
"project": self.project
|
|
||||||
}
|
|
||||||
|
|
||||||
if isinstance(self.affiliation, Institution):
|
|
||||||
if self.affiliation.ror:
|
|
||||||
return_dict["affiliation"] = self.affiliation.ror
|
|
||||||
elif self.affiliation.display_name:
|
|
||||||
return_dict["affiliation"] = self.affiliation.display_name
|
|
||||||
else:
|
|
||||||
return_dict["affiliation"] = ""
|
|
||||||
else:
|
|
||||||
return_dict["affiliation"] = ""
|
|
||||||
|
|
||||||
return return_dict
|
|
||||||
|
|
||||||
def format_name(self) -> str:
|
def format_name(self) -> str:
|
||||||
"""
|
"""
|
||||||
Format the name in 'Family, Given' order.
|
Format the name in 'Family, Given' order.
|
||||||
|
@ -367,59 +295,37 @@ class Person:
|
||||||
|
|
||||||
def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]:
|
def author_fields(self) -> list[PrimitiveMetadataField | ControlledVocabularyMetadataField]:
|
||||||
"""
|
"""
|
||||||
Build metadata fields for the author.
|
Build metadata fields for an author.
|
||||||
|
|
||||||
The method handles both Institution objects and string values for affiliations.
|
|
||||||
Different fields are generated depending on whether ORCID is available.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: List of metadata fields representing the author, including name,
|
list: List of metadata fields representing the author.
|
||||||
affiliation, and optionally ORCID identifier information.
|
|
||||||
"""
|
"""
|
||||||
affiliation_field = None
|
|
||||||
if isinstance(self.affiliation, Institution):
|
|
||||||
affiliation_field = self.affiliation.affiliation_field()
|
|
||||||
else:
|
|
||||||
affiliation_field = PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
|
|
||||||
|
|
||||||
if self.orcid:
|
if self.orcid:
|
||||||
return [
|
return [
|
||||||
PrimitiveMetadataField("authorName", False, self.format_name()),
|
PrimitiveMetadataField("authorName", False, self.format_name()),
|
||||||
affiliation_field,
|
PrimitiveMetadataField("authorAffiliation", False, self.affiliation),
|
||||||
ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
|
ControlledVocabularyMetadataField("authorIdentifierScheme", False, "ORCID"),
|
||||||
PrimitiveMetadataField("authorIdentifier", False, self.orcid)
|
PrimitiveMetadataField("authorIdentifier", False, self.orcid)
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
return [
|
return [
|
||||||
PrimitiveMetadataField("authorName", False, self.format_name()),
|
PrimitiveMetadataField("authorName", False, self.format_name()),
|
||||||
affiliation_field
|
PrimitiveMetadataField("authorAffiliation", False, self.affiliation)
|
||||||
]
|
]
|
||||||
|
|
||||||
def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
|
def dataset_contact_fields(self) -> list[PrimitiveMetadataField]:
|
||||||
"""
|
"""
|
||||||
Generate metadata fields for dataset contact.
|
Build metadata fields for dataset contact information.
|
||||||
|
|
||||||
The method handles both Institution objects and string values for affiliations.
|
|
||||||
Creates fields for the contact name, affiliation, and email address.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: List of metadata fields for the dataset contact including name,
|
list: List of metadata fields for the dataset contact.
|
||||||
affiliation, and email address.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
affiliation_field = None
|
|
||||||
if isinstance(self.affiliation, Institution):
|
|
||||||
affiliation_field = self.affiliation.affiliation_field()
|
|
||||||
else:
|
|
||||||
affiliation_field = PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation)
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
PrimitiveMetadataField("datasetContactName", False, self.format_name()),
|
PrimitiveMetadataField("datasetContactName", False, self.format_name()),
|
||||||
affiliation_field,
|
PrimitiveMetadataField("datasetContactAffiliation", False, self.affiliation),
|
||||||
PrimitiveMetadataField("datasetContactEmail", False, self.email)
|
PrimitiveMetadataField("datasetContactEmail", False, self.email)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class License:
|
class License:
|
||||||
"""
|
"""
|
||||||
|
@ -447,12 +353,6 @@ class Abstract:
|
||||||
source: str
|
source: str
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
"""
|
|
||||||
Validate that the abstract source is one of the allowed values.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If source is not one of the allowed values.
|
|
||||||
"""
|
|
||||||
allowed_sources = ["crossref", "openalex", "none"]
|
allowed_sources = ["crossref", "openalex", "none"]
|
||||||
if self.source not in allowed_sources:
|
if self.source not in allowed_sources:
|
||||||
raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.")
|
raise ValueError(f"{self.source} is not valid! Needs to be one of {str(allowed_sources)}.")
|
||||||
|
@ -998,7 +898,7 @@ class CitationBuilder:
|
||||||
"""
|
"""
|
||||||
Builds various citation-related metadata fields.
|
Builds various citation-related metadata fields.
|
||||||
"""
|
"""
|
||||||
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder, ror: bool = False) -> None:
|
def __init__(self, data: dict[str, Any], doi: str, pi_finder: PIFinder) -> None:
|
||||||
"""
|
"""
|
||||||
Initialize the CitationBuilder with data, DOI, and a PIFinder.
|
Initialize the CitationBuilder with data, DOI, and a PIFinder.
|
||||||
|
|
||||||
|
@ -1009,7 +909,6 @@ class CitationBuilder:
|
||||||
"""
|
"""
|
||||||
self.data = data
|
self.data = data
|
||||||
self.doi = doi
|
self.doi = doi
|
||||||
self.ror = ror
|
|
||||||
self.pi_finder = pi_finder
|
self.pi_finder = pi_finder
|
||||||
|
|
||||||
def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
|
def build_other_ids(self) -> list[list[PrimitiveMetadataField]]:
|
||||||
|
@ -1096,7 +995,7 @@ class CitationBuilder:
|
||||||
authorship (dict[str, Any]): Authorship metadata.
|
authorship (dict[str, Any]): Authorship metadata.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Person: Processed author
|
Person: Processed author.
|
||||||
"""
|
"""
|
||||||
display_name = author.get("display_name", "")
|
display_name = author.get("display_name", "")
|
||||||
given_name, family_name = NameProcessor.split_name(display_name)
|
given_name, family_name = NameProcessor.split_name(display_name)
|
||||||
|
@ -1104,19 +1003,9 @@ class CitationBuilder:
|
||||||
person = Person(family_name, given_name)
|
person = Person(family_name, given_name)
|
||||||
|
|
||||||
if affiliations := authorship.get("affiliations"):
|
if affiliations := authorship.get("affiliations"):
|
||||||
affiliation = Institution(affiliations[0].get("raw_affiliation_string", "").strip())
|
affiliation = affiliations[0].get("raw_affiliation_string", "").strip()
|
||||||
|
|
||||||
person.affiliation = affiliation
|
person.affiliation = affiliation
|
||||||
|
|
||||||
if self.ror:
|
|
||||||
if institutions := authorship.get("institutions"):
|
|
||||||
institution = institutions[0]
|
|
||||||
if institution.get("ror"):
|
|
||||||
affiliation = Institution(institution.get("display_name"), institution.get("ror"))
|
|
||||||
|
|
||||||
person.affiliation = affiliation
|
|
||||||
|
|
||||||
|
|
||||||
if orcid := author.get("orcid"):
|
if orcid := author.get("orcid"):
|
||||||
person.orcid = normalize_orcid(orcid)
|
person.orcid = normalize_orcid(orcid)
|
||||||
|
|
||||||
|
@ -1213,7 +1102,6 @@ class MetadataProcessor:
|
||||||
default_subject: str = "Other",
|
default_subject: str = "Other",
|
||||||
contact_mail: str | None = None,
|
contact_mail: str | None = None,
|
||||||
upload: bool = False,
|
upload: bool = False,
|
||||||
ror: bool= False,
|
|
||||||
console: Console | None = None,
|
console: Console | None = None,
|
||||||
progress: Progress | None = None,
|
progress: Progress | None = None,
|
||||||
task_id: TaskID | None = None
|
task_id: TaskID | None = None
|
||||||
|
@ -1227,7 +1115,6 @@ class MetadataProcessor:
|
||||||
output_path (Path | None): Path where metadata will be saved.
|
output_path (Path | None): Path where metadata will be saved.
|
||||||
default_subject (str): Default subject.
|
default_subject (str): Default subject.
|
||||||
contact_mail (str | None): Contact email address.
|
contact_mail (str | None): Contact email address.
|
||||||
ror (bool): Whether to use ROR id for affiliation
|
|
||||||
upload (bool): Whether to upload metadata.
|
upload (bool): Whether to upload metadata.
|
||||||
console (Console | None): Rich console instance.
|
console (Console | None): Rich console instance.
|
||||||
progress (Progress | None): Progress bar instance.
|
progress (Progress | None): Progress bar instance.
|
||||||
|
@ -1247,7 +1134,6 @@ class MetadataProcessor:
|
||||||
pi_objects = [Person(**pi) for pi in config.PIS]
|
pi_objects = [Person(**pi) for pi in config.PIS]
|
||||||
self.pi_finder = PIFinder(pi_objects)
|
self.pi_finder = PIFinder(pi_objects)
|
||||||
self.upload = upload
|
self.upload = upload
|
||||||
self.ror = ror
|
|
||||||
self.progress = progress
|
self.progress = progress
|
||||||
self.task_id = task_id
|
self.task_id = task_id
|
||||||
|
|
||||||
|
@ -1364,10 +1250,9 @@ class MetadataProcessor:
|
||||||
license_info = LicenseProcessor.process_license(data)
|
license_info = LicenseProcessor.process_license(data)
|
||||||
abstract_processor = AbstractProcessor(self.api_client)
|
abstract_processor = AbstractProcessor(self.api_client)
|
||||||
abstract = abstract_processor.get_abstract(self.doi, data, license_info)
|
abstract = abstract_processor.get_abstract(self.doi, data, license_info)
|
||||||
citation_builder = CitationBuilder(data, self.doi, self.pi_finder, self.ror)
|
citation_builder = CitationBuilder(data, self.doi, self.pi_finder)
|
||||||
|
|
||||||
authors, corresponding_authors = citation_builder.build_authors()
|
authors, corresponding_authors = citation_builder.build_authors()
|
||||||
|
|
||||||
author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
|
author_fields: list[list[PrimitiveMetadataField | ControlledVocabularyMetadataField]] = []
|
||||||
corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
|
corresponding_author_fields: list[list[PrimitiveMetadataField]] = []
|
||||||
for author in authors:
|
for author in authors:
|
||||||
|
@ -1566,28 +1451,8 @@ class MetadataProcessor:
|
||||||
"""
|
"""
|
||||||
if self.output_path:
|
if self.output_path:
|
||||||
try:
|
try:
|
||||||
# Custom JSON encoder to handle custom objects
|
|
||||||
class CustomEncoder(json.JSONEncoder):
|
|
||||||
"""
|
|
||||||
Custom JSON encoder that handles objects with to_dict method.
|
|
||||||
|
|
||||||
This allows for proper serialization of custom classes like
|
|
||||||
Institution and Person by calling their to_dict method when
|
|
||||||
available.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
o: The object to serialize.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A JSON-serializable representation of the object.
|
|
||||||
"""
|
|
||||||
def default(self, o: Any) -> Any:
|
|
||||||
if hasattr(o, 'to_dict'):
|
|
||||||
return o.to_dict()
|
|
||||||
return super().default(o)
|
|
||||||
|
|
||||||
with open(self.output_path, "w", encoding="utf-8") as f:
|
with open(self.output_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(metadata, f, indent=4, ensure_ascii=False, cls=CustomEncoder)
|
json.dump(metadata, f, indent=4, ensure_ascii=False)
|
||||||
self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
|
self.console.print(f"{ICONS['save']} Metadata saved in: {self.output_path}", style="info")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
|
self.console.print(f"{ICONS['error']} Error saving metadata: {str(e)}\n", style="error")
|
||||||
|
@ -1672,8 +1537,7 @@ def process_doi_batch(
|
||||||
depositor: str | None = None,
|
depositor: str | None = None,
|
||||||
default_subject: str = "Medicine, Health and Life Sciences",
|
default_subject: str = "Medicine, Health and Life Sciences",
|
||||||
contact_mail: str | None = None,
|
contact_mail: str | None = None,
|
||||||
upload: bool = False,
|
upload: bool = False
|
||||||
ror: bool = False
|
|
||||||
) -> dict[str, list[Any]]:
|
) -> dict[str, list[Any]]:
|
||||||
"""
|
"""
|
||||||
Process a batch of DOIs and return a summary of results.
|
Process a batch of DOIs and return a summary of results.
|
||||||
|
@ -1685,7 +1549,6 @@ def process_doi_batch(
|
||||||
default_subject (str): Default subject for metadata.
|
default_subject (str): Default subject for metadata.
|
||||||
contact_mail (str | None): Contact email address.
|
contact_mail (str | None): Contact email address.
|
||||||
upload (bool): Flag indicating whether to upload metadata to Dataverse.
|
upload (bool): Flag indicating whether to upload metadata to Dataverse.
|
||||||
ror (bool): Flag indicating whether to use ROR ID for affiliation.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
|
dict[str, list[Any]]: Dictionary with keys 'success' and 'failed'.
|
||||||
|
@ -1754,7 +1617,6 @@ def process_doi_batch(
|
||||||
default_subject=default_subject,
|
default_subject=default_subject,
|
||||||
contact_mail=contact_mail,
|
contact_mail=contact_mail,
|
||||||
upload=upload,
|
upload=upload,
|
||||||
ror=ror,
|
|
||||||
console=console,
|
console=console,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
task_id=status_task
|
task_id=status_task
|
||||||
|
@ -1826,12 +1688,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-u", "--upload",
|
"-u", "--upload",
|
||||||
help="Upload to Dataverse",
|
help="Upload to Dataverse",
|
||||||
action="store_true"
|
action='store_true'
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"-r", "--use-ror",
|
|
||||||
help="Use ROR ID if available",
|
|
||||||
action="store_true"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
@ -1870,8 +1727,7 @@ if __name__ == "__main__":
|
||||||
depositor=args.depositor,
|
depositor=args.depositor,
|
||||||
default_subject=args.subject,
|
default_subject=args.subject,
|
||||||
contact_mail=args.contact_mail,
|
contact_mail=args.contact_mail,
|
||||||
upload=args.upload,
|
upload=args.upload
|
||||||
ror=args.use_ror
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue