Coverage for apps/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 45%

426 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-07-18 09:02 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# ckeditor_parser.py parses the HTML strings created by a CKEditor 

6# with TeX formulas inside <span class="mathjax-formula"> 

7# It returns the JATS equivalent. 

8# 

9# Ex: <p>Te&lt;st&nbsp;<span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p> 

10# <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol> 

11# 

12################################################################################################## 

13 

if __name__ == "__main__":
    import os
    import sys

    # When the module is run as a script, make the directory located five
    # levels above this file importable by appending it to sys.path.
    BASE_DIR = os.path.abspath(__file__)
    for _ in range(5):
        BASE_DIR = os.path.dirname(BASE_DIR)
    sys.path.append(BASE_DIR)

24 

25import os 

26 

27from lxml import etree 

28 

29from django.conf import settings 

30 

31from ptf.cmds.xml.xml_utils import escape 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.cmds.xml.xml_utils import replace_html_entities 

34from ptf.utils import create_innerlink_for_citation 

35 

36 

class CkeditorParser:
    """
    Walk an HTML tree produced by CKEditor and build three parallel strings.

    After construction the results are available as:
      - value_html: HTML used for display
      - value_tex:  HTML-like markup that keeps the TeX source of the formulas
      - value_xml:  JATS-like XML

    Every ``parse_*`` method returns a ``(html_text, tex_text, xml_text)`` tuple.
    Methods named ``parse_node_with_<tag>`` are looked up dynamically by
    parse_node_with_mixed_content: a ``-`` in the tag name maps to ``_``.

    Keyword arguments read by the constructor:
      - tree:           an already parsed element; takes precedence over html_value
      - html_value:     the HTML string to parse (wrapped in a <body> element)
      - mml_formulas:   list of MathML strings, consumed (popped) in document order,
                        one per formula found; optional, defaults to an empty list
      - ignore_p:       if true, <p> wrappers are dropped from the 3 outputs
      - pid, issue_pid: used to build the URL of the images
      - volume:         stored, not used by this class
      - check_citation, biblio: when check_citation is true, texts are passed to
                        create_innerlink_for_citation together with biblio
    """

    def __init__(self, *args, **kwargs):
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" in kwargs:
            tree = kwargs["tree"]
        elif "html_value" in kwargs:
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = (kwargs["html_value"] or "").replace("\n\n", "")
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            # Same exception type as the former kwargs["tree"] lookup
            raise KeyError("CkeditorParser requires a 'tree' or an 'html_value' argument")

        # The list is optional. It is not copied: formulas are popped from the
        # caller's list as they are consumed.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)
        self.pid = kwargs.get("pid", None)
        self.volume = kwargs.get("volume", None)
        self.issue_pid = kwargs.get("issue_pid", None)
        self.check_citation = kwargs.get("check_citation", False)
        self.biblio = kwargs.get("biblio", None)

        self.parse_tree(tree)

    ###############################################################################################
    # Private helpers
    ###############################################################################################

    @staticmethod
    def _class_attribute(node):
        """
        :param node: element to inspect
        :return: the value of the "class" attribute of the node, "" if there is none
        """
        classe = ""
        for raw_name, value in node.attrib.items():
            # The value is read with the raw key: the normalized name
            # is only used for the comparison.
            if normalize(raw_name) == "class":
                classe = value
        return classe

    def _wrap_with_class(self, tag, node, **kwargs):
        """
        Shared implementation for the tags that are copied as is with their class
        (h1..h6, colgroup, tbody).

        :param tag: name of the tag to emit in the 3 outputs
        :param node: element to parse
        :return: html_text, tex_text, xml_text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = self._class_attribute(node)

        # The attribute value is quoted so that a multi-token class stays 1 attribute
        html_text = f"<{tag} class='{classe}'>{inner_html_text}</{tag}>"
        tex_text = f"<{tag} class='{classe}'>{inner_tex_text}</{tag}>"
        xml_text = f'<{tag} xml:space="preserve">{inner_jats_xml_text}</{tag}>'

        return html_text, tex_text, xml_text

    def _parse_table_cell(self, tag, node, **kwargs):
        """
        Shared implementation for <td> and <th>.

        :param tag: "td" or "th"
        :param node: element to parse
        :return: html_text, tex_text, xml_text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        classe = rowspan = colspan = ""
        for raw_name, value in node.attrib.items():
            name = normalize(raw_name)
            if name == "class":
                classe = value
            elif name == "rowspan":
                rowspan = value
            elif name == "colspan":
                colspan = value

        spans = f"rowspan='{rowspan}' colspan='{colspan}'"
        if classe:
            start_tag = f"<{tag} class='{classe}' {spans}>"
        else:
            start_tag = f"<{tag} {spans}>"

        html_text = f"{start_tag}{inner_html_text}</{tag}>"
        tex_text = f"{start_tag}{inner_tex_text}</{tag}>"
        xml_text = f'<{tag} xml:space="preserve">{inner_jats_xml_text}</{tag}>'

        return html_text, tex_text, xml_text

    ###############################################################################################
    # Formulas and lists
    ###############################################################################################

    def parse_formula(self, node, **kwargs):
        """
        Build the 3 outputs for a formula node.

        The TeX source is the text of the node. If it starts with "\\(", the first
        and last 2 characters are removed. The result is wrapped in $...$.
        If self.mml_formulas is not empty, its first item is popped and used as the
        MathML of the formula.

        The formula is a display formula (<disp-formula>) when its parent is a <p>
        without text and without tail. Otherwise it is an <inline-formula>.

        :param node: the formula element
        :param kwargs: "display" (optional) adds the "display" class to the HTML span
                       when no MathML is available
        :return: html_text, tex_text, xml_text
        """
        formula = node.text or ""
        display = kwargs.get("display", None)
        if formula.startswith("\\("):
            formula = formula[2:-2]
        # NOTE(review): "\[...\]" delimiters are not removed - confirm this is wanted

        mml = ""
        if self.mml_formulas:
            mml = self.mml_formulas.pop(0)

        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        formula = f"${formula}$"
        # NOTE(review): the formula is inserted unescaped in the title attribute and
        # in the span content - confirm it cannot contain quotes or "<"
        if mml:
            html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        elif display:
            html_text = f'<span class="mathjax-formula display" title="{formula}">{formula}</span>'
        else:
            html_text = f'<span class="mathjax-formula" title="{formula}">{formula}</span>'
        tex_text = formula

        alternatives = f"<alternatives>{mml}<tex-math>{escape(formula)}</tex-math></alternatives>"

        if is_inline:
            xml_text = f"<inline-formula>{alternatives}</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix

            xml_text = f'<disp-formula xml:space="preserve">\n{alternatives}</disp-formula>'

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """
        Build the 3 outputs for a <ul> or <ol> node.
        The JATS list-type is "simple" for <ul>, "number" for any other tag.

        :param node: the list element
        :return: html_text, tex_text, xml_text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        list_type = "simple" if node.tag == "ul" else "number"

        html_text = f"<{node.tag}>{inner_html_text}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex_text}</{node.tag}>"
        xml_text = f'<list list-type="{list_type}">{inner_jats_xml_text}</list>'

        return html_text, tex_text, xml_text

    ###############################################################################################
    # Generic traversal
    ###############################################################################################

    def parse_node_inner(self, node, **kwargs):
        """
        Parse the content of a node (its text and its children), without the node's
        own tag and without its tail.
        Used by the parse_node_with_<tag> functions, which add the tags themselves.

        :param node: element to parse
        :param kwargs: "escape" (default: True) escapes the text in the HTML and TeX outputs.
                       The text of the XML output is always escaped.
        :return: inner_html_text, inner_tex_text, inner_jats_xml_text
        """

        kwargs["is_top"] = False
        # The function may be called directly, without the "escape" flag
        do_escape = kwargs.get("escape", True)
        inner_html_text = inner_tex_text = inner_jats_xml_text = ""

        text = node.text
        if text:
            # NOTE(review): "list" and "item" are not HTML tags; with <ul>/<ol>/<li>
            # this condition is never true
            if text[0] == "\n" and node.tag in ("list", "item"):
                text = text[1:]

            escaped_text = escape(text)
            markup = escaped_text if do_escape else text

            inner_jats_xml_text += escaped_text
            inner_html_text += markup
            inner_tex_text += markup

            if self.check_citation and node.tag != "a":
                inner_html_text = create_innerlink_for_citation(inner_html_text, self.biblio)

        for child in node:
            (
                child_html_text,
                child_tex_text,
                child_jats_xml_text,
            ) = self.parse_node_with_mixed_content(child, **kwargs)
            inner_html_text += child_html_text
            inner_tex_text += child_tex_text
            inner_jats_xml_text += child_jats_xml_text

        return inner_html_text, inner_tex_text, inner_jats_xml_text

    def parse_node_with_mixed_content(self, node, **kwargs):
        """
        Parse a node which mixes text and sub-nodes, followed by its tail.
        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>

        If a parse_node_with_<tag> method exists for the tag of the node, it is called
        to generate the 3 texts. Otherwise only the content of the node is kept
        (see parse_node_inner). Nodes whose tag is not a string (comments, processing
        instructions) are ignored, but their tail is kept.

        The JATS xml string is constructed at the same time as the HTML and TeX ones.

        :param node: element to parse (may be None)
        :param kwargs: "is_top" (default: True), "escape" (default: True)
        :return: HTML text, TeX text, XML text
        """

        html_text = tex_text = jats_xml_text = ""

        if node is None:
            return html_text, tex_text, jats_xml_text

        kwargs.setdefault("is_top", True)

        # lxml replaces HTML entities in node.text and node.tail (like &lt;)
        # kwargs['escape'] allows to escape back the values
        kwargs.setdefault("escape", True)

        tag = node.tag

        # I. Add the node's content.
        if isinstance(tag, str):
            handler = None
            handler_name = "parse_node_with_" + tag.replace("-", "_")
            # A <mixed-content> tag must not resolve to this very function:
            # it would recurse forever.
            if handler_name != "parse_node_with_mixed_content":
                handler = getattr(self, handler_name, None)

            if callable(handler):
                html_text, tex_text, jats_xml_text = handler(node, **kwargs)
            else:
                html_text, tex_text, jats_xml_text = self.parse_node_inner(node, **kwargs)

        # II. Add the node's tail: the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        tail = node.tail
        if tail:
            escaped_tail = escape(tail)
            markup = escaped_tail if kwargs["escape"] else tail

            tex_text += markup
            jats_xml_text += escaped_tail

            # Same rule as for the node's text in parse_node_inner: only the HTML gets
            # the citation links. The tree itself is left untouched.
            if self.check_citation and tag != "a":
                markup = create_innerlink_for_citation(markup, self.biblio)
            html_text += markup

        return html_text, tex_text, jats_xml_text

    def parse_tree(self, tree):
        """
        Parse the whole tree and store the results in value_html, value_tex, value_xml.

        :param tree: root element
        """
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )

    ###############################################################################################
    # Tag handlers (called dynamically by parse_node_with_mixed_content)
    ###############################################################################################

    def parse_node_with_a(self, node, **kwargs):
        """
        <a> becomes an <ext-link> in JATS.
        Without href attribute, the TeX content of the node is used as the link target.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        href = ""
        for raw_name, value in node.attrib.items():
            if normalize(raw_name) == "href":
                href = value

        if not href:
            href = inner_tex_text

        # NOTE(review): href is inserted as is in the attributes; a "&" in the URL is
        # not escaped in the XML output - confirm how consumers handle it
        html_text = f'<a href="{href}">{inner_html_text}</a>'
        tex_text = f'<a href="{href}">{inner_tex_text}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """<br> becomes <br/> in HTML/TeX and <break/> in JATS. The node is not read."""
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_node_with_colgroup(self, node, **kwargs):
        """<colgroup> is copied with its class."""
        return self._wrap_with_class("colgroup", node, **kwargs)

    def parse_node_with_col(self, node, **kwargs):
        """<col> is copied with its style, and with its class if it has one."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        classe = ""
        style = ""
        for raw_name, value in node.attrib.items():
            name = normalize(raw_name)
            if name == "class":
                classe = value
            elif name == "style":
                style = value

        if classe:
            start_tag = f"<col class='{classe}' style='{style}'>"
        else:
            start_tag = f"<col style='{style}'>"

        html_text = f"{start_tag}{inner_html_text}</col>"
        tex_text = f"{start_tag}{inner_tex_text}</col>"
        xml_text = '<col xml:space="preserve">' + inner_jats_xml_text + "</col>"

        return html_text, tex_text, xml_text

    def parse_node_with_div(self, node, **kwargs):
        """
        <div> is copied with a class taken from its "class" or "data-custom-style"
        attribute (the last one found wins).
        The "PCJ-Reference" divs and the "PCJ-Section" divs containing "References"
        are removed from the 3 outputs.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        classe = ""
        for raw_name, value in node.attrib.items():
            name = normalize(raw_name)
            if name == "class":
                classe = value
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                if value == "PCJ Equation":
                    classe = "mathjax-formula PCJ-Equation"
                else:
                    classe = value.replace(" ", "-")

        if classe == "PCJ-Reference":
            return "", "", ""
        if classe == "PCJ-Section" and "References" in inner_html_text:
            return "", "", ""

        html_text = f"<div class='{classe}'>{inner_html_text}</div>"
        tex_text = f"<div class='{classe}'>{inner_tex_text}</div>"
        xml_text = '<div xml:space="preserve">' + inner_jats_xml_text + "</div>"

        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """<em> becomes a span.italique in HTML, <i> in TeX, <italic> in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f'<span class="italique">{inner_html_text}</span>'
        tex_text = f"<i>{inner_tex_text}</i>"

        if inner_jats_xml_text:
            xml_text = f"<italic>{inner_jats_xml_text}</italic>"
        else:
            xml_text = "<italic/>"

        return html_text, tex_text, xml_text

    def parse_node_with_h1(self, node, **kwargs):
        """<h1> is copied with its class."""
        return self._wrap_with_class("h1", node, **kwargs)

    def parse_node_with_h2(self, node, **kwargs):
        """<h2> is copied with its class."""
        return self._wrap_with_class("h2", node, **kwargs)

    def parse_node_with_h3(self, node, **kwargs):
        """<h3> is copied with its class."""
        return self._wrap_with_class("h3", node, **kwargs)

    def parse_node_with_h4(self, node, **kwargs):
        """<h4> is copied with its class."""
        return self._wrap_with_class("h4", node, **kwargs)

    def parse_node_with_h5(self, node, **kwargs):
        """<h5> is copied with its class."""
        return self._wrap_with_class("h5", node, **kwargs)

    def parse_node_with_h6(self, node, **kwargs):
        """<h6> is copied with its class."""
        return self._wrap_with_class("h6", node, **kwargs)

    def parse_node_with_img(self, node, **kwargs):
        """
        <img> becomes a <graphic> in JATS.
        The image URL is rebuilt from settings.SITE_URL_PREFIX (if defined), issue_pid,
        pid and the base name of the original src.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        prefix = getattr(settings, "SITE_URL_PREFIX", "")

        src = f"{prefix}/media/img/{self.issue_pid}/{self.pid}/src/media"
        href = ""
        classe = ""
        for raw_name, value in node.attrib.items():
            name = normalize(raw_name)
            if name == "src":
                img = os.path.basename(value)
                stem, ext = os.path.splitext(img)
                # If an image was converted to jpg, pandoc still wrote the html with the
                # previous extension, '.tiff' for example
                if ext in (".tiff", ".tif"):
                    img = stem + ".jpg"
                src = f"{src}/{img}"
                # The <graphic> points to the same file as the <img>
                href = src
            elif name == "style":
                classe = "article-body-img"
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<img src='{src}' class='{classe}'>{inner_html_text}</img>"
        tex_text = f"<img src='{src}' class='{classe}'>{inner_tex_text}</img>"
        xml_text = f'<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</graphic>'

        return html_text, tex_text, xml_text

    def parse_node_with_li(self, node, **kwargs):
        """
        <li> becomes a <list-item><p>...</p></list-item> in JATS.
        The items that are not inside a <ul> get the "article-list" class.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        parent_node = node.getparent()
        # An <li> without parent is handled like an item of a numbered list
        if parent_node is not None and parent_node.tag == "ul":
            start_tag = "<li >"
        else:
            start_tag = "<li class='article-list'>"

        html_text = f"{start_tag}{inner_html_text}</li>"
        tex_text = f"{start_tag}{inner_tex_text}</li>"
        xml_text = f"<list-item><p>{inner_jats_xml_text}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """<ol> becomes a <list list-type="number"> in JATS."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_p(self, node, **kwargs):
        """
        <p> is copied, unless self.ignore_p is set: only its content is then kept.
        An empty paragraph becomes a self-closed <p/> in JATS.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        if self.ignore_p:
            return inner_html_text, inner_tex_text, inner_jats_xml_text

        html_text = f"<p>{inner_html_text}</p>"
        tex_text = f"<p>{inner_tex_text}</p>"
        if inner_jats_xml_text:
            xml_text = '<p xml:space="preserve">' + inner_jats_xml_text + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """
        A span with the "mathjax-formula", "math inline" or "math display" class is a
        formula (see parse_formula). Any other span is copied (with its class) in
        HTML/TeX; only its content is kept in JATS.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        the_class = node.get("class")
        display = the_class == "math display"
        if the_class in ("math inline", "math display"):
            the_class = "mathjax-formula"

        if the_class == "mathjax-formula":
            return self.parse_formula(node, display=display)

        start_tag = "<span>" if the_class is None else f'<span class="{the_class}">'

        html_text = f"{start_tag}{inner_html_text}</span>"
        tex_text = f"{start_tag}{inner_tex_text}</span>"
        xml_text = inner_jats_xml_text

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """<strong> becomes <bold> in JATS."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<strong>{inner_html_text}</strong>"
        tex_text = f"<strong>{inner_tex_text}</strong>"

        if inner_jats_xml_text:
            xml_text = f"<bold>{inner_jats_xml_text}</bold>"
        else:
            xml_text = "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """<sub> is copied in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sub>{inner_html_text}</sub>"
        tex_text = f"<sub>{inner_tex_text}</sub>"
        xml_text = f"<sub>{inner_jats_xml_text}</sub>"

        return html_text, tex_text, xml_text

    def parse_node_with_sup(self, node, **kwargs):
        """<sup> is copied in the 3 outputs."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sup>{inner_html_text}</sup>"
        tex_text = f"<sup>{inner_tex_text}</sup>"
        xml_text = f"<sup>{inner_jats_xml_text}</sup>"

        return html_text, tex_text, xml_text

    def parse_node_with_table(self, node, **kwargs):
        """
        <table> is copied with a class taken from its "class" or "data-custom-style"
        attribute (the last one found wins).
        If self.issue_pid contains "PCJ", the table is wrapped in a div.PCJ-table.
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        classe = ""
        for raw_name, value in node.attrib.items():
            name = normalize(raw_name)
            if name == "class":
                classe = value
            # Next condition checks style identification with pandoc library used
            # for docx --> html conversion
            elif name == "data-custom-style":
                classe = value.replace(" ", "-")

        html_text = f"<table class='{classe}'>{inner_html_text}</table>"
        tex_text = f"<table class='{classe}'>{inner_tex_text}</table>"

        # issue_pid is None when the parser is created without it
        if self.issue_pid and "PCJ" in self.issue_pid:
            html_text = f"<div class='PCJ-table'>{html_text}</div>"
            tex_text = f"<div class='PCJ-table'>{tex_text}</div>"

        xml_text = '<table xml:space="preserve">' + inner_jats_xml_text + "</table>"

        return html_text, tex_text, xml_text

    def parse_node_with_tbody(self, node, **kwargs):
        """<tbody> is copied with its class."""
        return self._wrap_with_class("tbody", node, **kwargs)

    def parse_node_with_td(self, node, **kwargs):
        """<td> is copied with its rowspan/colspan, and with its class if it has one."""
        return self._parse_table_cell("td", node, **kwargs)

    def parse_node_with_th(self, node, **kwargs):
        """<th> is copied with its rowspan/colspan, and with its class if it has one."""
        return self._parse_table_cell("th", node, **kwargs)

    def parse_node_with_tr(self, node, **kwargs):
        """<tr> is copied with an empty class; the attributes of the node are not read."""
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )
        classe = ""

        html_text = f"<tr class='{classe}'>{inner_html_text}</tr>"
        tex_text = f"<tr class='{classe}'>{inner_tex_text}</tr>"
        xml_text = '<tr xml:space="preserve">' + inner_jats_xml_text + "</tr>"

        return html_text, tex_text, xml_text

    def parse_node_with_ul(self, node, **kwargs):
        """<ul> becomes a <list list-type="simple"> in JATS."""
        return self.parse_list(node, **kwargs)

686 

687 

688if __name__ == "__main__": 688 ↛ 689line 688 didn't jump to line 689, because the condition on line 688 was never true

689 html_value = r'<p>Te&lt;st&nbsp;<span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol>' 

690 parser = CkeditorParser(html_value=html_value) 

691 result = parser.value_xml 

692 print(result)