fix: Improve JATS list tag conversion with sequential processing
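Refactored list tag conversion to handle ordered and unordered lists more robustly. The replacements dict previously listed the "</jats:list>" key twice (once mapped to "</ul>" and once to "</ol>"); in a Python dict literal the later entry silently overrides the earlier one, so the two list types could not be closed differently. List tags are now converted sequentially before the dict is applied: ordered-list opening tags are replaced first, a regex rewrites the closing tag that follows each <ol>, and any remaining </jats:list> tags are closed as </ul>.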
This commit is contained in:
parent
d5bd11a8ed
commit
d8036fea2f
1 changed file with 17 additions and 4 deletions
|
@ -866,6 +866,23 @@ class AbstractProcessor:
|
|||
if not text:
|
||||
return ""
|
||||
|
||||
# Handle list tags with sequential processing to avoid duplicate keys
|
||||
# Process ordered lists first - replace both opening and closing tags
|
||||
text = text.replace('<jats:list list-type="order">', "<ol>")
|
||||
# Find and replace closing tags for ordered lists
|
||||
import re
|
||||
|
||||
# Replace closing tags that follow ordered list openings
|
||||
# This regex lazily matches from each <ol> to the nearest following </jats:list>; nested lists are not tracked
|
||||
pattern = r'(<ol>.*?)</jats:list>'
|
||||
text = re.sub(pattern, r'\1</ol>', text, flags=re.DOTALL)
|
||||
|
||||
# Process unordered lists second
|
||||
text = text.replace('<jats:list list-type="bullet">', "<ul>")
|
||||
# Replace remaining </jats:list> tags as unordered list closings
|
||||
text = text.replace('</jats:list>', '</ul>')
|
||||
|
||||
# Handle other JATS tags
|
||||
replacements = {
|
||||
"<jats:italic>": "<i>",
|
||||
"</jats:italic>": "</i>",
|
||||
|
@ -885,10 +902,6 @@ class AbstractProcessor:
|
|||
"</jats:p>": "</p>",
|
||||
"<jats:title>": "<h2>",
|
||||
"</jats:title>": "</h2>",
|
||||
'<jats:list list-type="bullet">': "<ul>",
|
||||
"</jats:list>": "</ul>",
|
||||
'<jats:list list-type="order">': "<ol>",
|
||||
"</jats:list>": "</ol>",
|
||||
"<jats:list-item>": "<li>",
|
||||
"</jats:list-item>": "</li>",
|
||||
"<jats:blockquote>": "<blockquote>",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue