feat!: generalize script by removing organizational metadata
All checks were successful
Test pipeline / test (push) Successful in 14s

Remove Phase class, organizational metadata blocks, and unused project fields. Update configuration
to use 'default_grants' and simplify PI usage to fallback corresponding author determination only.

BREAKING CHANGES:
- Remove 'phase' and 'project' fields from configuration
- Use 'default_grants' instead of 'default_grant'
- Generate only standard Dataverse citation metadata
This commit is contained in:
Alexander Minges 2025-07-07 14:41:39 +02:00
parent 01bc537bd8
commit 67b46d5140
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4
11 changed files with 207 additions and 269 deletions

View file

@ -109,36 +109,6 @@ class FieldType(Enum):
COMPOUND = "compound"
VOCABULARY = "controlledVocabulary"
@dataclass
class Phase:
    """
    Represents a project phase with a defined time span.

    Attributes:
        name (str): The name of the project phase.
        start (int): The start year of the project phase (inclusive).
        end (int): The end year of the project phase (inclusive).
    """

    name: str
    start: int
    end: int

    def check_year(self, year: int) -> bool:
        """
        Checks whether a given year falls within the project's phase boundaries.

        Both boundaries are inclusive, i.e. ``start`` and ``end`` themselves
        count as being inside the phase.

        Args:
            year (int): The year to check.

        Returns:
            bool: True if the year is within the phase boundaries, otherwise False.
        """
        # The chained comparison already evaluates to a bool; no branching needed.
        return self.start <= year <= self.end
@dataclass
class BaseMetadataField[T]:
"""
@ -301,7 +271,7 @@ class Institution:
"termName": self.display_name,
"@type": "https://schema.org/Organization"
}
return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value)
return PrimitiveMetadataField("authorAffiliation", False, self.ror, expanded_value=expanded_value)
else:
return PrimitiveMetadataField("authorAffiliation", False, self.display_name)
@ -316,14 +286,12 @@ class Person:
orcid (str): ORCID identifier (optional).
email (str): Email address (optional).
affiliation (Institution): Affiliation of the person (optional).
project (list[str]): List of associated projects.
"""
family_name: str
given_name: str
orcid: str = ""
email: str = ""
affiliation: Institution | str = ""
project: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, str | list[str] | dict[str, str]]:
"""
@ -340,8 +308,7 @@ class Person:
"family_name": self.family_name,
"given_name": self.given_name,
"orcid": self.orcid,
"email": self.email,
"project": self.project
"email": self.email
}
if isinstance(self.affiliation, Institution):
@ -464,12 +431,10 @@ class ConfigData:
Attributes:
dataverse (dict[str, str]): Dataverse-related configuration.
phase (dict[str, dict[str, int]]): Mapping of project phases.
pis (list[dict[str, Any]]): List of principal investigator configurations.
default_grants (list[dict[str, str]]): Default grant configurations.
"""
dataverse: dict[str, str]
phase: dict[str, dict[str, int]]
pis: list[dict[str, Any]]
default_grants: list[dict[str, str]]
@ -523,7 +488,6 @@ class Config:
cls._config_data = ConfigData(
dataverse=config_data.get('dataverse', {}),
phase=config_data.get('phase', {}),
pis=config_data.get('pis', []),
default_grants=config_data.get('default_grants', [])
)
@ -545,16 +509,6 @@ class Config:
raise RuntimeError("Failed to load configuration")
return cls._config_data
@property
def PHASE(self) -> dict[str, dict[str, int]]:
    """
    Expose the ``phase`` section of the loaded configuration.

    Returns:
        dict[str, dict[str, int]]: The ``phase`` attribute of the object
        returned by ``get_config()``.
    """
    loaded_config = self.get_config()
    return loaded_config.phase
@property
def PIS(self) -> list[dict[str, Any]]:
"""
@ -833,7 +787,10 @@ class AbstractProcessor:
else:
console.print(f"\n{ICONS['warning']} No abstract found in CrossRef!", style="warning")
else:
console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
if license.name:
console.print(f"\n{ICONS['info']} License {license.name} does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
else:
console.print(f"\n{ICONS['info']} Custom license does not allow derivative works. Reconstructing abstract from OpenAlex!", style="info")
openalex_abstract = self._get_openalex_abstract(data)
@ -1406,8 +1363,7 @@ class MetadataProcessor:
CompoundMetadataField("grantNumber", True, grants).to_dict()
],
"displayName": "Citation Metadata"
},
"crc1430_org_v1": self._build_organization_metadata(data)
}
},
"files": []
}
@ -1473,71 +1429,22 @@ class MetadataProcessor:
"""
return data.get("publication_year", "")
def _build_organization_metadata(self, data: dict[str, Any]) -> dict[str, Any]:
    """
    Build organization metadata fields (phase, project, PI names).

    The phases are looked up from the publication year (no phases when the
    year is missing), while projects and PI names are collected from the
    involved PIs returned by ``_get_involved_pis``.

    Args:
        data (dict[str, Any]): The metadata.

    Returns:
        dict[str, Any]: Organization metadata with a single ``"fields"`` key
        holding the serialized phase, project and PI vocabulary fields.
    """
    publication_year = self._get_publication_year(data)
    phases = self._get_phases(int(publication_year)) if publication_year else []

    pis = self._get_involved_pis(data)
    projects: list[str] = [project for pi in pis for project in pi.project]
    pi_names: list[str] = [pi.format_name() for pi in pis]

    # Deduplicate projects and PI names. dict.fromkeys keeps the first-seen
    # order, so the generated metadata is identical from run to run; going
    # through set() would yield an arbitrary, hash-seed-dependent order.
    unique_projects = list(dict.fromkeys(projects))
    unique_pi_names = list(dict.fromkeys(pi_names))

    return {
        "fields": [
            ControlledVocabularyMetadataField("crc1430OrgV1Phase", True, phases).to_dict(),
            ControlledVocabularyMetadataField("crc1430OrgV1Project", True, unique_projects).to_dict(),
            ControlledVocabularyMetadataField("crc1430OrgV1PI", True, unique_pi_names).to_dict()
        ]
    }
def _get_phases(self, year: int) -> list[str]:
    """
    Collect the names of all configured phases that contain a given year.

    Each entry of the configured phase mapping is turned into a ``Phase``
    built from its ``"start"`` and ``"end"`` values and tested with
    ``Phase.check_year``.

    Args:
        year (int): The publication year.

    Returns:
        list[str]: Names of the matching phases, in configuration order.
    """
    phase_table = Config().PHASE
    # Lazily build one Phase per configured entry, preserving mapping order.
    candidates = (
        Phase(label, bounds["start"], bounds["end"])
        for label, bounds in phase_table.items()
    )
    return [candidate.name for candidate in candidates if candidate.check_year(year)]
def _get_involved_pis(self, data: dict[str, Any]) -> list[Person]:
"""
Identify involved principal investigators from the metadata.
Identify involved principal investigators from the metadata for use as fallback
corresponding authors.
This method matches authors in the publication metadata against the configured
PIs and returns matching PIs. It is used as a fallback when no corresponding
authors are explicitly declared in the publication metadata.
Args:
data (dict[str, Any]): The metadata.
data (dict[str, Any]): The metadata from OpenAlex.
Returns:
list[Person]: List of PIs.
list[Person]: List of matching PIs for use as corresponding authors.
"""
involved_pis: list[Person] = []
for authorship in data.get("authorships", []):