Coverage for apps/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 45%
426 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
##################################################################################################
#
# README
#
# ckeditor_parser.py parses the HTML strings created by a CKEditor,
# with TeX formulas inside <span class="math-tex">,
# and returns the JATS equivalent.
# (The parser itself treats spans whose class is "mathjax-formula",
# "math inline" or "math display" as formulas.)
#
# Ex: <p>Te<st <span class="math-tex">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p>
#     <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>
#
##################################################################################################
if __name__ == "__main__":
    import os
    import sys

    # When the module is executed directly, append the directory five levels
    # above this file to sys.path before the project imports below are run.
    BASE_DIR = os.path.abspath(__file__)
    for _ in range(5):
        BASE_DIR = os.path.dirname(BASE_DIR)
    sys.path.append(BASE_DIR)
25import os
27from lxml import etree
29from django.conf import settings
31from ptf.cmds.xml.xml_utils import escape
32from ptf.cmds.xml.xml_utils import normalize
33from ptf.cmds.xml.xml_utils import replace_html_entities
34from ptf.utils import create_innerlink_for_citation
class CkeditorParser:
    """Convert a CKEditor HTML fragment into HTML, TeX and JATS XML strings.

    The element tree is walked recursively by
    :meth:`parse_node_with_mixed_content`, which dispatches every element to a
    ``parse_node_with_<tag>`` method when one exists (``-`` in the tag is
    replaced by ``_``) and falls back to :meth:`parse_node_inner` otherwise.
    Every ``parse_*`` method returns a ``(html_text, tex_text, xml_text)``
    tuple.

    After construction the results are stored in ``value_html``,
    ``value_tex`` and ``value_xml``.
    """

    def __init__(self, *args, **kwargs):
        """Parse the input immediately and store the three resulting strings.

        Keyword arguments read by the constructor:

        :param html_value: HTML string to parse (used only when ``tree`` is absent)
        :param tree: already parsed element tree (takes precedence over ``html_value``)
        :param mml_formulas: list of MathML strings, consumed front to back, one
            per formula met in the tree (defaults to an empty list)
        :param ignore_p: when true, ``<p>`` wrappers are not emitted (default False)
        :param pid: identifier used to build image URLs
        :param volume: stored on the instance, not used by the methods of this class
        :param issue_pid: identifier used to build image URLs and to detect "PCJ" tables
        :param check_citation: when true, texts are passed to
            ``create_innerlink_for_citation`` (default False)
        :param biblio: passed to ``create_innerlink_for_citation``
        :raises KeyError: if neither ``tree`` nor ``html_value`` is given
        """
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" not in kwargs and "html_value" in kwargs:
            # NOTE(review): resolve_entities=True is applied to editor-supplied
            # markup; confirm this input is trusted (entity expansion / XXE).
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = kwargs["html_value"].replace("\n\n", "")
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            tree = kwargs["tree"]

        # mml_formulas used to be mandatory (KeyError when omitted).
        # The caller's list is used as is: parse_formula pops items from it.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)
        self.pid = kwargs.get("pid", None)
        self.volume = kwargs.get("volume", None)
        self.issue_pid = kwargs.get("issue_pid", None)
        self.check_citation = kwargs.get("check_citation", False)
        self.biblio = kwargs.get("biblio", None)

        self.parse_tree(tree)

    @staticmethod
    def _iter_attributes(node):
        """Yield ``(normalized_name, value)`` for every attribute of ``node``.

        The value is read with the original attribute key: ``normalize()`` may
        return a name that is not a key of ``node.attrib``, in which case
        indexing with the normalized name would raise ``KeyError``.
        """
        for key in node.attrib:
            yield normalize(key), node.attrib[key]

    def _parse_classed_element(self, node, html_tag, **kwargs):
        """Shared implementation for elements that only carry over ``class``.

        Used by the heading, ``tbody`` and ``colgroup`` handlers.

        :param node: element to convert
        :param html_tag: tag name written in the three outputs
        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value

        # NOTE(review): the class value is emitted unquoted (and as "class="
        # when empty); kept byte-identical to the historical output.
        html_text = f"<{html_tag} class={classe}>{inner_html_text}</{html_tag}>"
        tex_text = f"<{html_tag} class={classe}>{inner_tex_text}</{html_tag}>"
        xml_text = f'<{html_tag} xml:space="preserve">{inner_jats_xml_text}</{html_tag}>'
        return html_text, tex_text, xml_text

    def _parse_table_cell(self, node, html_tag, **kwargs):
        """Shared implementation of the ``td`` and ``th`` handlers.

        ``class``, ``rowspan`` and ``colspan`` are carried over to the HTML and
        TeX outputs; the XML output only keeps the tag and the inner content.

        :param node: cell element to convert
        :param html_tag: ``"td"`` or ``"th"``
        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = rowspan = colspan = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "rowspan":
                rowspan = value
            elif name == "colspan":
                colspan = value

        # The class attribute is written only when it is not empty.
        class_attr = f"class={classe} " if classe else ""
        opening = f"<{html_tag} {class_attr}rowspan='{rowspan}' colspan='{colspan}'>"
        html_text = f"{opening}{inner_html_text}</{html_tag}>"
        tex_text = f"{opening}{inner_tex_text}</{html_tag}>"
        xml_text = f'<{html_tag} xml:space="preserve">{inner_jats_xml_text}</{html_tag}>'
        return html_text, tex_text, xml_text

    def parse_formula(self, node, **kwargs):
        """Convert a formula ``<span>`` into its HTML, TeX and JATS forms.

        The TeX source is the node's text; a leading ``\\(`` and a trailing
        ``\\)`` are removed and the formula is wrapped in ``$...$``. When
        ``self.mml_formulas`` is not empty its first item is popped and used as
        the MathML alternative.

        :param node: the formula element
        :param kwargs: ``display`` adds the "display" class to the HTML span
            when no MathML is available
        :return: HTML text, TeX text, XML text
        """
        formula = node.text or ""
        display = kwargs.get("display", None)
        if formula.startswith("\\("):
            formula = formula[2:]
            # Only drop the closing delimiter when it is really there
            # (the last two characters used to be removed unconditionally).
            if formula.endswith("\\)"):
                formula = formula[:-2]
        # NOTE(review): "\[ ... \]" delimiters are not stripped.

        mml = ""
        if self.mml_formulas:
            mml = self.mml_formulas.pop(0) or ""

        # NOTE(review): only the parent's text/tail are inspected, not this
        # node's own tail or siblings: "<p><span>..</span> text</p>" is handled
        # as a display formula. Confirm this is intended.
        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        formula = f"${formula}$"
        # NOTE(review): the formula is written unescaped in the HTML title
        # attribute and span content.
        if mml:
            html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        elif display:
            html_text = f'<span class="mathjax-formula display" title="{formula}">{formula}</span>'
        else:
            html_text = f'<span class="mathjax-formula" title="{formula}">{formula}</span>'
        tex_text = formula

        alternatives = f"<alternatives>{mml}<tex-math>{escape(formula)}</tex-math></alternatives>"
        if is_inline:
            xml_text = f"<inline-formula>{alternatives}</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix
            xml_text = f'<disp-formula xml:space="preserve">\n{alternatives}</disp-formula>'

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """Convert a ``<ul>``/``<ol>`` element.

        The JATS list type is "simple" for ``ul`` and "number" for any other
        tag; the HTML and TeX outputs keep the original tag.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        list_type = "simple" if node.tag == "ul" else "number"
        xml_text = f'<list list-type="{list_type}">{inner_jats_xml_text}</list>'
        html_text = f"<{node.tag}>{inner_html_text}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex_text}</{node.tag}>"

        return html_text, tex_text, xml_text

    def parse_node_inner(self, node, **kwargs):
        """Return the content of ``node`` (its text and children) without its own tag.

        Used by the ``parse_node_with_<tag>`` handlers, which then wrap the
        result in the HTML/TeX/JATS tags they need.

        :param node: element whose content is converted
        :param kwargs: ``escape`` (default True) tells whether the text is
            escaped in the HTML and TeX outputs; it is always escaped in the XML
        :return: inner HTML text, inner TeX text, inner XML text
        """
        kwargs["is_top"] = False
        do_escape = kwargs.get("escape", True)
        inner_html_text = inner_tex_text = inner_jats_xml_text = ""

        text = node.text
        if text:
            if text[0] == "\n" and node.tag in ("list", "item"):
                text = text[1:]

            escaped_text = escape(text)
            shown_text = escaped_text if do_escape else text
            inner_jats_xml_text += escaped_text
            inner_html_text += shown_text
            inner_tex_text += shown_text

            # Citation links are added to the HTML output only.
            if self.check_citation and node.tag != "a":
                inner_html_text = create_innerlink_for_citation(inner_html_text, self.biblio)

        for child in node:
            (
                child_html_text,
                child_tex_text,
                child_jats_xml_text,
            ) = self.parse_node_with_mixed_content(child, **kwargs)
            inner_html_text += child_html_text
            inner_tex_text += child_tex_text
            inner_jats_xml_text += child_jats_xml_text

        return inner_html_text, inner_tex_text, inner_jats_xml_text

    def parse_node_with_a(self, node, **kwargs):
        """Convert an ``<a>`` element into an HTML link and a JATS ``<ext-link>``.

        When the element has no (or an empty) ``href``, the inner TeX text is
        used as the link target.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        href = ""
        for name, value in self._iter_attributes(node):
            if name == "href":
                href = value

        if href:
            # The attribute value comes back from the parser with its entities
            # decoded: escape it again before writing it in an XML attribute.
            xml_href = escape(href)
        else:
            href = xml_href = inner_tex_text

        html_text = f'<a href="{href}">{inner_html_text}</a>'
        tex_text = f'<a href="{href}">{inner_tex_text}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{xml_href}">{inner_jats_xml_text}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """Convert ``<br>``: ``<br/>`` in HTML/TeX, ``<break/>`` in JATS."""
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_node_with_colgroup(self, node, **kwargs):
        """Convert a ``<colgroup>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "colgroup", **kwargs)

    def parse_node_with_col(self, node, **kwargs):
        """Convert a ``<col>`` element, keeping ``class`` and ``style`` in HTML/TeX.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        style = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "style":
                style = value

        # The class attribute is written only when it is not empty.
        class_attr = f"class={classe} " if classe else ""
        opening = f"<col {class_attr}style='{style}'>"
        html_text = f"{opening}{inner_html_text}</col>"
        tex_text = f"{opening}{inner_tex_text}</col>"

        xml_text = '<col xml:space="preserve">' + inner_jats_xml_text + "</col>"
        return html_text, tex_text, xml_text

    def parse_node_with_div(self, node, **kwargs):
        """Convert a ``<div>`` element.

        The class comes from ``class`` or from ``data-custom-style`` (style
        identification written by the pandoc docx --> html conversion), the
        last one met winning. A "PCJ-Section" div whose HTML content contains
        "References" and any "PCJ-Reference" div produce three empty strings.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "data-custom-style":
                if value == "PCJ Equation":
                    classe = "mathjax-formula PCJ-Equation"
                else:
                    classe = value.replace(" ", "-")

        is_references_section = classe == "PCJ-Section" and "References" in inner_html_text
        if is_references_section or classe == "PCJ-Reference":
            return "", "", ""

        html_text = f"<div class='{classe}'>{inner_html_text}</div>"
        tex_text = f"<div class='{classe}'>{inner_tex_text}</div>"

        xml_text = '<div xml:space="preserve">' + inner_jats_xml_text + "</div>"
        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """Convert ``<em>``: italic span in HTML, ``<i>`` in TeX, ``<italic>`` in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f'<span class="italique">{inner_html_text}</span>'
        tex_text = f"<i>{inner_tex_text}</i>"

        if inner_jats_xml_text:
            xml_text = f"<italic>{inner_jats_xml_text}</italic>"
        else:
            xml_text = "<italic/>"

        return html_text, tex_text, xml_text

    def parse_node_with_h1(self, node, **kwargs):
        """Convert an ``<h1>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h1", **kwargs)

    def parse_node_with_h2(self, node, **kwargs):
        """Convert an ``<h2>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h2", **kwargs)

    def parse_node_with_h3(self, node, **kwargs):
        """Convert an ``<h3>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h3", **kwargs)

    def parse_node_with_h4(self, node, **kwargs):
        """Convert an ``<h4>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h4", **kwargs)

    def parse_node_with_h5(self, node, **kwargs):
        """Convert an ``<h5>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h5", **kwargs)

    def parse_node_with_h6(self, node, **kwargs):
        """Convert an ``<h6>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "h6", **kwargs)

    def parse_node_with_img(self, node, **kwargs):
        """Convert an ``<img>`` element into an HTML image and a JATS ``<graphic>``.

        The image URL is rebuilt as
        ``<SITE_URL_PREFIX>/media/img/<issue_pid>/<pid>/src/media/<file name>``;
        a ``.tif``/``.tiff`` extension is replaced by ``.jpg``.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        # SITE_URL_PREFIX is optional in the settings.
        prefix = getattr(settings, "SITE_URL_PREFIX", "")

        src = f"{prefix}/media/img/{self.issue_pid}/{self.pid}/src/media"
        # NOTE(review): href is never filled in, so the <graphic> element
        # always gets an empty xlink:href. Confirm the expected target.
        href = ""
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "src":
                img = os.path.basename(value)
                stem, ext = os.path.splitext(img)
                # If an image was converted to jpg, pandoc still wrote the html
                # with the previous extension, '.tiff' for example
                if ext in (".tiff", ".tif"):
                    img = stem + ".jpg"
                src = f"{src}/{img}"
            elif name == "style":
                classe = "article-body-img"
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<img src={src} class={classe}>{inner_html_text}</img>"
        # The TeX output is built from the TeX inner text, like every other
        # handler (the HTML inner text was used here).
        tex_text = f"<img src={src} class={classe}>{inner_tex_text}</img>"
        xml_text = f'<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</graphic>'

        return html_text, tex_text, xml_text

    def parse_node_with_li(self, node, **kwargs):
        """Convert an ``<li>`` element into an HTML item and a JATS ``<list-item>``.

        Items whose parent is not a ``<ul>`` get the "article-list" class.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        parent_node = node.getparent()
        # A detached <li> (no parent) is handled like a non-<ul> item.
        if parent_node is not None and parent_node.tag == "ul":
            opening = "<li >"
        else:
            opening = "<li class='article-list'>"
        html_text = f"{opening}{inner_html_text}</li>"
        tex_text = f"{opening}{inner_tex_text}</li>"

        xml_text = f"<list-item><p>{inner_jats_xml_text}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_mixed_content(self, node, **kwargs):
        """Convert an element that mixes text and sub-elements, tail included.

        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>

        The element is handled by ``parse_node_with_<tag>`` when this method
        exists, by :meth:`parse_node_inner` otherwise (the element's own tag is
        then dropped). The tail (text following the end of the element) is
        appended to the three outputs.

        :param node: element to convert (three empty strings are returned for None)
        :param kwargs: ``is_top`` and ``escape`` both default to True
        :return: HTML text, TeX text, XML text
        """
        if node is None:
            return "", "", ""

        kwargs.setdefault("is_top", True)
        # lxml decodes the entities in node.text and node.tail (like &lt;);
        # kwargs["escape"] tells whether the values are escaped back.
        kwargs.setdefault("escape", True)

        # I. The element itself: dedicated handler or generic fallback.
        handler = getattr(self, "parse_node_with_" + node.tag.replace("-", "_"), None)
        if callable(handler):
            html_text, tex_text, jats_xml_text = handler(node, **kwargs)
        else:
            html_text, tex_text, jats_xml_text = self.parse_node_inner(node, **kwargs)

        # II. The element's tail.
        tail = node.tail
        if tail:
            # The XML output always gets the plain escaped tail; the input
            # tree is left untouched.
            jats_xml_text += escape(tail)
            if self.check_citation and node.tag != "a":
                # The result is inserted unescaped: it may contain link markup.
                linked_tail = create_innerlink_for_citation(tail, self.biblio)
                html_text += linked_tail
                tex_text += linked_tail
            else:
                shown_tail = escape(tail) if kwargs["escape"] else tail
                html_text += shown_tail
                tex_text += shown_tail

        return html_text, tex_text, jats_xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """Convert an ``<ol>`` element (see :meth:`parse_list`)."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_p(self, node, **kwargs):
        """Convert a ``<p>`` element.

        When ``self.ignore_p`` is true only the inner content is returned;
        otherwise an empty paragraph gives a self-closed ``<p/>`` in the XML.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        if self.ignore_p:
            return inner_html_text, inner_tex_text, inner_jats_xml_text

        html_text = f"<p>{inner_html_text}</p>"
        tex_text = f"<p>{inner_tex_text}</p>"
        if inner_jats_xml_text:
            xml_text = '<p xml:space="preserve">' + inner_jats_xml_text + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """Convert a ``<span>`` element.

        Spans of class "mathjax-formula", "math inline" or "math display" are
        formulas (see :meth:`parse_formula`). Other spans are kept in HTML/TeX
        (with their class, if any) and dropped from the XML, which only gets
        their content.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        the_class = node.get("class")
        display = the_class == "math display"
        if the_class in ("math inline", "math display"):
            the_class = "mathjax-formula"

        if the_class == "mathjax-formula":
            return self.parse_formula(node, display=display)

        opening = "<span>" if the_class is None else f'<span class="{the_class}">'
        html_text = f"{opening}{inner_html_text}</span>"
        tex_text = f"{opening}{inner_tex_text}</span>"
        xml_text = inner_jats_xml_text

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """Convert ``<strong>``: kept in HTML/TeX, ``<bold>`` in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<strong>{inner_html_text}</strong>"
        tex_text = f"<strong>{inner_tex_text}</strong>"

        if inner_jats_xml_text:
            xml_text = f"<bold>{inner_jats_xml_text}</bold>"
        else:
            xml_text = "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """Convert ``<sub>``: the same tag is used in the three outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sub>{inner_html_text}</sub>"
        tex_text = f"<sub>{inner_tex_text}</sub>"
        xml_text = f"<sub>{inner_jats_xml_text}</sub>"

        return html_text, tex_text, xml_text

    def parse_node_with_sup(self, node, **kwargs):
        """Convert ``<sup>``: the same tag is used in the three outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sup>{inner_html_text}</sup>"
        tex_text = f"<sup>{inner_tex_text}</sup>"
        xml_text = f"<sup>{inner_jats_xml_text}</sup>"

        return html_text, tex_text, xml_text

    def parse_node_with_table(self, node, **kwargs):
        """Convert a ``<table>`` element.

        The class comes from ``class`` or from ``data-custom-style`` (style
        identification written by the pandoc docx --> html conversion). When
        ``self.issue_pid`` contains "PCJ", the HTML/TeX table is wrapped in a
        ``<div class='PCJ-table'>``.

        :return: HTML text, TeX text, XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""
        for name, value in self._iter_attributes(node):
            if name == "class":
                classe = value
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<table class={classe}>{inner_html_text}</table>"
        tex_text = f"<table class={classe}>{inner_tex_text}</table>"
        # issue_pid defaults to None: test it before searching for "PCJ".
        if self.issue_pid and "PCJ" in self.issue_pid:
            html_text = f"<div class='PCJ-table'>{html_text}</div>"
            tex_text = f"<div class='PCJ-table'>{tex_text}</div>"

        xml_text = '<table xml:space="preserve">' + inner_jats_xml_text + "</table>"
        return html_text, tex_text, xml_text

    def parse_node_with_tbody(self, node, **kwargs):
        """Convert a ``<tbody>`` element, keeping its ``class`` in HTML/TeX."""
        return self._parse_classed_element(node, "tbody", **kwargs)

    def parse_node_with_td(self, node, **kwargs):
        """Convert a ``<td>`` element (see :meth:`_parse_table_cell`)."""
        return self._parse_table_cell(node, "td", **kwargs)

    def parse_node_with_th(self, node, **kwargs):
        """Convert a ``<th>`` element (see :meth:`_parse_table_cell`)."""
        return self._parse_table_cell(node, "th", **kwargs)

    def parse_node_with_tr(self, node, **kwargs):
        """Convert a ``<tr>`` element; the HTML/TeX row always gets an empty class."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""

        html_text = f"<tr class='{classe}'>{inner_html_text}</tr>"
        tex_text = f"<tr class='{classe}'>{inner_tex_text}</tr>"

        xml_text = '<tr xml:space="preserve">' + inner_jats_xml_text + "</tr>"
        return html_text, tex_text, xml_text

    def parse_node_with_ul(self, node, **kwargs):
        """Convert a ``<ul>`` element (see :meth:`parse_list`)."""
        return self.parse_list(node, **kwargs)

    def parse_tree(self, tree):
        """Parse ``tree`` and store the results in ``value_html``/``value_tex``/``value_xml``."""
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )
if __name__ == "__main__":
    # Manual smoke test: parse a sample CKEditor fragment and print its JATS XML.
    html_value = r'<p>Te<st <span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li> </li></ol>'
    # mml_formulas is passed explicitly: the sample has no MathML alternative
    # and the constructor reads this keyword argument.
    parser = CkeditorParser(html_value=html_value, mml_formulas=[])
    result = parser.value_xml
    print(result)
692 print(result)