From d8036fea2f407ed63fd9b20516e37df47e4f6db2 Mon Sep 17 00:00:00 2001 From: Alexander Minges Date: Thu, 10 Jul 2025 15:13:39 +0200 Subject: [PATCH] fix: Improve JATS list tag conversion with sequential processing Refactored list tag conversion to handle ordered and unordered lists more robustly. Uses regex for closing ordered list tags to prevent duplicate key issues and ensures proper tag replacement. --- doi2dataset.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/doi2dataset.py b/doi2dataset.py index 6d5d694..7d23409 100755 --- a/doi2dataset.py +++ b/doi2dataset.py @@ -866,6 +866,23 @@ class AbstractProcessor: if not text: return "" + # Handle list tags with sequential processing to avoid duplicate keys + # Process ordered lists first - replace both opening and closing tags + text = text.replace('', "
    ") + # Find and replace closing tags for ordered lists + import re + + # Replace closing tags that follow ordered list openings + # This regex matches that comes after
      tags + pattern = r'(
        .*?)' + text = re.sub(pattern, r'\1
      ', text, flags=re.DOTALL) + + # Process unordered lists second + text = text.replace('', "
        ") + # Replace remaining tags as unordered list closings + text = text.replace('', '
      ') + + # Handle other JATS tags replacements = { "": "", "": "", @@ -885,10 +902,6 @@ class AbstractProcessor: "": "

      ", "": "

      ", "": "

      ", - '': "
        ", - "": "
      ", - '': "
        ", - "": "
      ", "": "
    1. ", "": "
    2. ", "": "
      ",