feat!: generalize script by removing organizational metadata
All checks were successful
Test pipeline / test (push) Successful in 14s
All checks were successful
Test pipeline / test (push) Successful in 14s
Remove the Phase class, the organizational metadata blocks, and the unused project fields. Update the configuration to use 'default_grants', and limit PI usage to determining fallback corresponding authors.

BREAKING CHANGES:
- Remove the 'phase' and 'project' fields from the configuration
- Use 'default_grants' instead of 'default_grant'
- Generate only standard Dataverse citation metadata
This commit is contained in:
parent
01bc537bd8
commit
67b46d5140
11 changed files with 207 additions and 269 deletions
123
doi2dataset.py
123
doi2dataset.py
|
@ -109,36 +109,6 @@ class FieldType(Enum):
|
|||
COMPOUND = "compound"
|
||||
VOCABULARY = "controlledVocabulary"
|
||||
|
||||
@dataclass
|
||||
class Phase:
|
||||
"""
|
||||
Represents a project phase with a defined time span.
|
||||
|
||||
Attributes:
|
||||
name (str): The name of the project phase.
|
||||
start (int): The start year of the project phase.
|
||||
end (int): The end year of the project phase.
|
||||
"""
|
||||
|
||||
name: str
|
||||
start: int
|
||||
end: int
|
||||
|
||||
def check_year(self, year: int) -> bool:
|
||||
"""
|
||||
Checks whether a given year falls within the project's phase boundaries.
|
||||
|
||||
Args:
|
||||
year (int): The year to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the year is within the phase boundaries, otherwise False.
|
||||
"""
|
||||
|
||||
if self.start <= year <= self.end:
|
||||
return True
|
||||
return False
|
||||
|
||||
@dataclass
|
||||
class BaseMetadataField[T]:
|
||||
"""
|
||||
|
@ -301,7 +271,7 @@ class Institution:
|
|||
"termName": self.display_name,
|
||||
"@type": "https://schema.org/Organization"
|
||||
}
|
||||
return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value)
|
||||
return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value=expanded_value)
|
||||
else:
|
||||
return PrimitiveMetadataField("authorAffiliation", False, self.display_name)
|
||||
|
||||
|
@ -316,14 +286,12 @@ class Person:
|
|||
orcid (str): ORCID identifier (optional).
|
||||
email (str): Email address (optional).
|
||||
affiliation (Institution): Affiliation of the person (optional).
|
||||
project (list[str]): List of associated projects.
|
||||
"""
|
||||
family_name: str
|
||||
given_name: str
|
||||
orcid: str = ""
|
||||
email: str = ""
|
||||
affiliation: Institution | str = ""
|
||||
project: list[str] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]:
|
||||
"""
|
||||
|
@ -340,8 +308,7 @@ class Person:
|
|||
"family_name": self.family_name,
|
||||
"given_name": self.given_name,
|
||||
"orcid": self.orcid,
|
||||
"email": self.email,
|
||||
"project": self.project
|
||||
"email": self.email
|
||||
}
|
||||
|
||||
if isinstance(self.affiliation, Institution):
|
||||
|
@ -464,12 +431,10 @@ class ConfigData:
|
|||
|
||||
Attributes:
|
||||
dataverse (dict[str, str]): Dataverse-related configuration.
|
||||
phase (dict[str, dict[str, int]]): Mapping of project phases.
|
||||
pis (list[dict[str, Any]]): List of principal investigator configurations.
|
||||
default_grants (list[dict[str, str]]): Default grant configurations.
|
||||
"""
|
||||
dataverse: dict[str, str]
|
||||
phase: dict[str, dict[str, int]]
|
||||
pis: list[dict[str, Any]]
|
||||
default_grants: list[dict[str, str]]
|
||||
|
||||
|
@ -523,7 +488,6 @@ class Config:
|
|||
|
||||
cls._config_data = ConfigData(
|
||||
dataverse=config_data.get('dataverse', {}),
|
||||
phase=config_data.get('phase', {}),
|
||||
pis=config_data.get('pis', []),
|
||||
default_grants=config_data.get('default_grants', [])
|
||||
)
|
||||
|
@ -545,16 +509,6 @@ class Config:
|
|||
raise RuntimeError("Failed to load configuration")
|
||||
return cls._config_data
|
||||
|
||||
@property
|
||||
def PHASE(self) -> dict[str, dict[str, int]]:
|
||||
"""
|
||||
Get phase configuration.
|
||||
|
||||
Returns:
|
||||
dict[str, dict[str, int]]: Mapping of phases.
|
||||
"""
|
||||
return self.get_config().phase
|
||||
|
||||
@property
|
||||
def PIS(self) -> list[dict[str, Any]]:
|
||||
"""
|
||||
|
@ -833,7 +787,10 @@ class AbstractProcessor:
|
|||
else:
|
||||
console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning")
|
||||
else:
|
||||
console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
|
||||
if license.name:
|
||||
console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
|
||||
else:
|
||||
console.print(f"\n{ICONS['info']} Custom license does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
|
||||
|
||||
|
||||
openalex_abstract = self._get_openalex_abstract(data)
|
||||
|
@ -1406,8 +1363,7 @@ class MetadataProcessor:
|
|||
CompoundMetadataField("grantNumber", True, grants).to_dict()
|
||||
],
|
||||
"displayName": "Citation Metadata"
|
||||
},
|
||||
"crc1430_org_v1": self._build_organization_metadata(data)
|
||||
}
|
||||
},
|
||||
"files": []
|
||||
}
|
||||
|
@ -1473,71 +1429,22 @@ class MetadataProcessor:
|
|||
"""
|
||||
return data.get("publication_year", "")
|
||||
|
||||
def _build_organization_metadata(self, data: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Build organization metadata fields (phase, project, PI names).
|
||||
|
||||
Args:
|
||||
data (dict[str, Any]): The metadata.
|
||||
|
||||
Returns:
|
||||
dict[str, Any]: Organization metadata.
|
||||
"""
|
||||
publication_year = self._get_publication_year(data)
|
||||
if publication_year:
|
||||
phases = self._get_phases(int(publication_year))
|
||||
else:
|
||||
phases = []
|
||||
|
||||
pis = self._get_involved_pis(data)
|
||||
projects: list[str] = []
|
||||
for pi in pis:
|
||||
for project in pi.project:
|
||||
projects.append(project)
|
||||
|
||||
pi_names: list[str] = []
|
||||
for pi in pis:
|
||||
pi_names.append(pi.format_name())
|
||||
|
||||
# Deduplicate projects and PI names
|
||||
unique_projects = list(set(projects))
|
||||
unique_pi_names = list(set(pi_names))
|
||||
|
||||
return {
|
||||
"fields": [
|
||||
ControlledVocabularyMetadataField("crc1430OrgV1Phase", True, phases).to_dict(),
|
||||
ControlledVocabularyMetadataField("crc1430OrgV1Project", True, unique_projects).to_dict(),
|
||||
ControlledVocabularyMetadataField("crc1430OrgV1PI", True, unique_pi_names).to_dict()
|
||||
]
|
||||
}
|
||||
|
||||
def _get_phases(self, year: int) -> list[str]:
|
||||
"""
|
||||
Determine the project phases matching a given publication year.
|
||||
|
||||
Args:
|
||||
year (int): The publication year.
|
||||
|
||||
Returns:
|
||||
list[str]: List of matching phase names.
|
||||
"""
|
||||
config = Config()
|
||||
matching_phases: list[str] = []
|
||||
for phase_name, phase_info in config.PHASE.items():
|
||||
phase = Phase(phase_name, phase_info["start"], phase_info["end"])
|
||||
if phase.check_year(year):
|
||||
matching_phases.append(phase.name)
|
||||
return matching_phases
|
||||
|
||||
def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]:
|
||||
"""
|
||||
Identify involved principal investigators from the metadata.
|
||||
Identify involved principal investigators from the metadata for use as fallback
|
||||
corresponding authors.
|
||||
|
||||
This method matches authors in the publication metadata against the configured
|
||||
PIs and returns matching PIs. It is used as a fallback when no corresponding
|
||||
authors are explicitly declared in the publication metadata.
|
||||
|
||||
Args:
|
||||
data (dict[str, Any]): The metadata.
|
||||
data (dict[str, Any]): The metadata from OpenAlex.
|
||||
|
||||
Returns:
|
||||
list[Person]: List of PIs.
|
||||
list[Person]: List of matching PIs for use as corresponding authors.
|
||||
"""
|
||||
involved_pis: list[Person] = []
|
||||
for authorship in data.get("authorships", []):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue