From d8036fea2f407ed63fd9b20516e37df47e4f6db2 Mon Sep 17 00:00:00 2001
From: Alexander Minges
Date: Thu, 10 Jul 2025 15:13:39 +0200
Subject: [PATCH] fix: Improve JATS list tag conversion with sequential
processing
Refactor list tag conversion to handle ordered and unordered lists more robustly. Use a regex for
closing ordered-list tags to avoid duplicate dictionary keys in the replacement table and ensure each tag is replaced correctly.
---
doi2dataset.py | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/doi2dataset.py b/doi2dataset.py
index 6d5d694..7d23409 100755
--- a/doi2dataset.py
+++ b/doi2dataset.py
@@ -866,6 +866,23 @@ class AbstractProcessor:
if not text:
return ""
+ # Handle list tags with sequential processing to avoid duplicate keys
+ # Process ordered lists first - replace both opening and closing tags
+ text = text.replace('', "")
+ # Find and replace closing tags for ordered lists
+ import re
+
+ # Replace closing tags that follow ordered list openings
+ # This regex matches a closing list tag that comes after an ordered-list opening tag
+ pattern = r'(.*?)'
+ text = re.sub(pattern, r'\1
', text, flags=re.DOTALL)
+
+ # Process unordered lists second
+ text = text.replace('', " tags as unordered list closings
+ text = text.replace('', '')
+
+ # Handle other JATS tags
replacements = {
"": "",
"": "",
@@ -885,10 +902,6 @@ class AbstractProcessor:
"": "
",
"": "",
"
": "",
- '': "": "",
- '': "",
- "
": "",
"": "",
"": "",
"": "",