fix: Improve JATS list tag conversion with sequential processing

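Refactored list tag conversion to handle ordered and unordered lists in sequence instead of through
the shared replacements dict. That dict listed "</jats:list>" twice, so the later entry ("</ol>")
silently overrode the earlier one ("</ul>"). Closing tags of ordered lists are now matched with a
regex, and any remaining "</jats:list>" is converted to "</ul>".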
This commit is contained in:
Alexander Minges 2025-07-10 15:13:39 +02:00
parent d5bd11a8ed
commit d8036fea2f
Signed by: Athemis
SSH key fingerprint: SHA256:TUXshgulbwL+FRYvBNo54pCsI0auROsSEgSvueKbkZ4

View file

@ -866,6 +866,23 @@ class AbstractProcessor:
if not text:
return ""
# Handle list tags with sequential processing to avoid duplicate keys
# Process ordered lists first - replace both opening and closing tags
text = text.replace('<jats:list list-type="order">', "<ol>")
# Find and replace closing tags for ordered lists
import re
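# Replace the closing tag that follows each ordered-list opening.
# The non-greedy regex pairs every <ol> with the nearest following </jats:list>,
# so it is only correct for lists that are not nested inside one another.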
pattern = r'(<ol>.*?)</jats:list>'
text = re.sub(pattern, r'\1</ol>', text, flags=re.DOTALL)
# Process unordered lists second
text = text.replace('<jats:list list-type="bullet">', "<ul>")
# Replace remaining </jats:list> tags as unordered list closings
text = text.replace('</jats:list>', '</ul>')
# Handle other JATS tags
replacements = {
"<jats:italic>": "<i>",
"</jats:italic>": "</i>",
@ -885,10 +902,6 @@ class AbstractProcessor:
"</jats:p>": "</p>",
"<jats:title>": "<h2>",
"</jats:title>": "</h2>",
'<jats:list list-type="bullet">': "<ul>",
"</jats:list>": "</ul>",
'<jats:list list-type="order">': "<ol>",
"</jats:list>": "</ol>",
"<jats:list-item>": "<li>",
"</jats:list-item>": "</li>",
"<jats:blockquote>": "<blockquote>",