Coverage for apps/ptf/cmds/xml/ckeditor/ckeditor_parser.py: 92%

187 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-02-28 09:09 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# ckeditor_parser.py parses the HTML strings created by a CKEditor 

6# with tex formulas inside <span class="mathjax-formula"> 

7# It returns the JATS equivalent. 

8# 

9# Ex: <p>Te&lt;st&nbsp;<span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p> 

10# <ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol> 

11# 

12################################################################################################## 

13 

if __name__ == "__main__":
    # Script mode only: extend sys.path before the project imports below run.
    import os
    import sys

    # BASE_DIR is the directory five levels above this file
    # (one os.path.dirname() per level, starting from the absolute file path).
    BASE_DIR = os.path.abspath(__file__)
    for _ in range(5):
        BASE_DIR = os.path.dirname(BASE_DIR)
    sys.path.append(BASE_DIR)

24 

25from lxml import etree 

26 

27from ptf.cmds.xml.xml_utils import escape 

28from ptf.cmds.xml.xml_utils import normalize 

29from ptf.cmds.xml.xml_utils import replace_html_entities 

30 

31 

class CkeditorParser:
    """
    Convert the HTML string produced by a CKEditor into HTML, TeX and JATS XML strings.

    The whole conversion runs in the constructor. Afterwards the results are available in
    ``value_html``, ``value_tex`` and ``value_xml``.

    Each element is dispatched on its tag name to a ``parse_node_with_<tag>`` method
    ("-" in the tag is replaced by "_"). Elements without a dedicated method contribute
    only their content (text and children); their own tag is dropped.
    Every ``parse_node_*`` method returns a ``(html_text, tex_text, jats_xml_text)`` tuple.
    """

    def __init__(self, *args, **kwargs):
        """
        :param args: unused
        :param kwargs:
            html_value: HTML string to parse (used only when no "tree" is given)
            tree: already parsed root node; takes precedence over html_value
            mml_formulas: list of MathML strings, consumed in document order, one per
                formula span. The list is used as is (not copied), so it is emptied
                as formulas are parsed. Optional, defaults to an empty list.
            ignore_p: if True, <p> wrappers are dropped from the 3 outputs (default False)
        :raises KeyError: if neither "tree" nor "html_value" is given
        """
        self.warnings = []
        self.value_xml = ""
        self.value_html = ""
        self.value_tex = ""

        if "tree" not in kwargs and "html_value" in kwargs:
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=False,
                remove_comments=True,
                resolve_entities=True,
            )
            html_value = kwargs["html_value"].replace("\n\n", "")
            # The fragment may have several top-level elements: wrap it in a single root.
            body = f"<body>{replace_html_entities(html_value)}</body>"
            tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        else:
            tree = kwargs["tree"]

        # mml_formulas used to be mandatory (KeyError when missing) whereas ignore_p was
        # optional. Both are optional now; an explicit list is still used by reference.
        mml_formulas = kwargs.get("mml_formulas")
        self.mml_formulas = [] if mml_formulas is None else mml_formulas
        self.ignore_p = kwargs.get("ignore_p", False)

        self.parse_tree(tree)

    def parse_tree(self, tree):
        """
        Parse the whole tree and store the results in value_html, value_tex and value_xml.

        :param tree: root node
        :return: None
        """
        self.value_html, self.value_tex, self.value_xml = self.parse_node_with_mixed_content(
            tree, is_top=True
        )

    @staticmethod
    def _render_text(text, keep_escaped):
        """Return the (html, tex, jats) strings of a raw text. The JATS one is always escaped."""
        escaped = escape(text)
        plain = escaped if keep_escaped else text
        return plain, plain, escaped

    def parse_node_inner(self, node, **kwargs):
        """
        Used by the parse_node_with_* methods to get the content of a node:
        its text followed by each child (with the child's tail).

        :param node: XML node
        :param kwargs: "escape" (default True): escape the text in the HTML and TeX values
        :return: HTML text, TeX text, JATS XML text of the node content (without its own tag)
        """
        # Children are never the top node: their tail has to be added.
        kwargs["is_top"] = False
        # Handlers may be called directly, without going through parse_node_with_mixed_content
        kwargs.setdefault("escape", True)

        inner_html_text = inner_tex_text = inner_jats_xml_text = ""

        text = node.text
        if text:
            if text.startswith("\n") and node.tag in ("list", "item"):
                text = text[1:]

            inner_html_text, inner_tex_text, inner_jats_xml_text = self._render_text(
                text, kwargs["escape"]
            )

        for child in node:
            (
                child_html_text,
                child_tex_text,
                child_jats_xml_text,
            ) = self.parse_node_with_mixed_content(child, **kwargs)
            inner_html_text += child_html_text
            inner_tex_text += child_tex_text
            inner_jats_xml_text += child_jats_xml_text

        return inner_html_text, inner_tex_text, inner_jats_xml_text

    def parse_node_with_a(self, node, **kwargs):
        """
        <a href="..."> becomes a JATS <ext-link>.
        Without href, the TeX content of the link is used as the href.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        href = ""
        for attrib_name, attrib_value in node.attrib.items():
            if normalize(attrib_name) == "href":
                href = attrib_value

        if not href:
            href = inner_tex_text

        # NOTE(review): href is inserted as is in the attributes. A value containing
        # & or " would give malformed markup - confirm whether it should be escaped.
        html_text = f'<a href="{href}">{inner_html_text}</a>'
        tex_text = f'<a href="{href}">{inner_tex_text}</a>'
        xml_text = f'<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="{href}">{inner_jats_xml_text}</ext-link>'

        return html_text, tex_text, xml_text

    def parse_node_with_strong(self, node, **kwargs):
        """
        <strong> becomes a JATS <bold>.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<strong>{inner_html_text}</strong>"
        tex_text = f"<strong>{inner_tex_text}</strong>"
        xml_text = f"<bold>{inner_jats_xml_text}</bold>" if inner_jats_xml_text else "<bold/>"

        return html_text, tex_text, xml_text

    def parse_node_with_em(self, node, **kwargs):
        """
        <em> becomes <span class="italique"> in HTML, <i> in TeX and <italic> in JATS.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f'<span class="italique">{inner_html_text}</span>'
        tex_text = f"<i>{inner_tex_text}</i>"
        xml_text = (
            f"<italic>{inner_jats_xml_text}</italic>" if inner_jats_xml_text else "<italic/>"
        )

        return html_text, tex_text, xml_text

    def parse_node_with_ol(self, node, **kwargs):
        """Ordered list. See parse_list."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_ul(self, node, **kwargs):
        """Unordered list. See parse_list."""
        return self.parse_list(node, **kwargs)

    def parse_node_with_li(self, node, **kwargs):
        """
        <li> becomes a JATS <list-item> whose content is wrapped in a <p>.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<li>{inner_html_text}</li>"
        tex_text = f"<li>{inner_tex_text}</li>"
        xml_text = f"<list-item><p>{inner_jats_xml_text}</p></list-item>"

        return html_text, tex_text, xml_text

    def parse_node_with_br(self, node, **kwargs):
        """
        <br> becomes a JATS <break/>.

        :return: HTML text, TeX text, JATS XML text
        """
        html_text = tex_text = "<br/>"
        xml_text = "<break/>"

        return html_text, tex_text, xml_text

    def parse_list(self, node, **kwargs):
        """
        <ul>/<ol> become a JATS <list>: list-type is "simple" for <ul>, "number" otherwise.
        The HTML and TeX values keep the original tag.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        list_type = "simple" if node.tag == "ul" else "number"

        # NOTE: the <list> is returned as is; it is not wrapped in a <p> here.
        xml_text = f'<list list-type="{list_type}">{inner_jats_xml_text}</list>'
        html_text = f"<{node.tag}>{inner_html_text}</{node.tag}>"
        tex_text = f"<{node.tag}>{inner_tex_text}</{node.tag}>"

        return html_text, tex_text, xml_text

    def parse_node_with_sub(self, node, **kwargs):
        """
        <sub> is kept as <sub> in the 3 values.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sub>{inner_html_text}</sub>"
        tex_text = f"<sub>{inner_tex_text}</sub>"
        xml_text = f"<sub>{inner_jats_xml_text}</sub>"

        return html_text, tex_text, xml_text

    def parse_node_with_sup(self, node, **kwargs):
        """
        <sup> is kept as <sup> in the 3 values.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        html_text = f"<sup>{inner_html_text}</sup>"
        tex_text = f"<sup>{inner_tex_text}</sup>"
        xml_text = f"<sup>{inner_jats_xml_text}</sup>"

        return html_text, tex_text, xml_text

    def parse_node_with_span(self, node, **kwargs):
        """
        <span class="mathjax-formula"> is a formula (see parse_formula).
        Any other span is kept in HTML/TeX (with its class, if any) and
        contributes only its content to the JATS XML.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        the_class = node.get("class")

        if the_class == "mathjax-formula":
            return self.parse_formula(node, **kwargs)

        # The closing tag has to be </span> (it used to be </li>, giving malformed HTML)
        opening_tag = "<span>" if the_class is None else f'<span class="{the_class}">'
        html_text = f"{opening_tag}{inner_html_text}</span>"
        tex_text = f"{opening_tag}{inner_tex_text}</span>"
        xml_text = inner_jats_xml_text

        return html_text, tex_text, xml_text

    def parse_formula(self, node, **kwargs):
        """
        Convert a formula span. The TeX formula is the text of the node, optionally
        surrounded by the inline math delimiters (backslash + parenthesis).
        The MathML, if any, is the next item popped from self.mml_formulas.

        The formula is a display formula (<disp-formula>) when its parent is a <p>
        with no text and no tail, an inline formula (<inline-formula>) otherwise.

        :return: HTML text, TeX text, JATS XML text
        """
        formula = node.text or ""

        if formula.startswith("\\("):
            formula = formula[2:]
            # Remove the closing delimiter only if it is there:
            # do not truncate a formula that was not properly closed.
            if formula.endswith("\\)"):
                formula = formula[:-2]

        mml = self.mml_formulas.pop(0) if len(self.mml_formulas) > 0 else ""

        # NOTE(review): the test looks at the text/tail of the parent only, not at the
        # tail of the formula nor at its siblings - confirm this is the intended rule.
        parent = node.getparent()
        is_inline = not (
            parent is not None and parent.tag == "p" and not parent.text and not parent.tail
        )

        formula = f"${formula}$"
        html_text = f'<span class="mathjax-formula" title="{formula}">{mml}</span>'
        tex_text = formula

        alternatives = (
            f"<alternatives>{mml}<tex-math>{escape(formula)}</tex-math></alternatives>"
        )

        if is_inline:
            xml_text = f"<inline-formula>{alternatives}</inline-formula>"
        else:
            prefix = '<table class="formula mathjax-formula"><tr><td class="formula-inner">'
            suffix = '</td><td class="formula-label"></td></tr></table>'
            html_text = prefix + html_text + suffix
            tex_text = prefix + tex_text + suffix

            xml_text = f'<disp-formula xml:space="preserve">\n{alternatives}</disp-formula>'

        return html_text, tex_text, xml_text

    def parse_node_with_mixed_content(self, node, **kwargs):
        """
        Parse and return the text of an XML node which mixes text and XML sub-nodes.
        Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
        Some inner nodes are removed, others are kept or replaced.

        The JATS xml string is constructed at the same time because it is used during a PTF export

        :param node: XML node (None gives 3 empty strings)
        :param kwargs:
            is_top (default True): the tail of the node is added only if is_top is False
            escape (default True): escape the text and tail in the HTML and TeX values
        :return: HTML text, TeX text, XML text
        """

        if node is None:
            return "", "", ""

        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # The HTML text has to include the tail
        # only if this function was called recursively
        kwargs.setdefault("is_top", True)

        # lxml replaces HTML entities in node.text and node.tail (like &lt;)
        # kwargs['escape'] allows to escape back the values
        kwargs.setdefault("escape", True)

        html_text = tex_text = jats_xml_text = ""

        # I. Add the node's content.
        #    Some tags have a corresponding parse_node_with_@tag function.
        tag = node.tag
        if isinstance(tag, str):
            fct_name = "parse_node_with_" + tag.replace("-", "_")
            ftor = None
            # A <mixed-content> tag must not be dispatched to this very function
            # (infinite recursion)
            if fct_name != "parse_node_with_mixed_content":
                ftor = getattr(self, fct_name, None)
            if not callable(ftor):
                ftor = self.parse_node_inner

            html_text, tex_text, jats_xml_text = ftor(node, **kwargs)
        # else: comments/processing instructions have a tag that is not a string.
        #       Their content is ignored, only their tail is kept.

        # II. Add the node's tail for children
        if node.tail and not kwargs["is_top"]:
            tail_html_text, tail_tex_text, tail_jats_xml_text = self._render_text(
                node.tail, kwargs["escape"]
            )
            html_text += tail_html_text
            tex_text += tail_tex_text
            jats_xml_text += tail_jats_xml_text

        return html_text, tex_text, jats_xml_text

    def parse_node_with_p(self, node, **kwargs):
        """
        <p> is kept (with xml:space="preserve" in JATS), or dropped if self.ignore_p is True.

        :return: HTML text, TeX text, JATS XML text
        """
        inner_html_text, inner_tex_text, inner_jats_xml_text = self.parse_node_inner(
            node, **kwargs
        )

        if self.ignore_p:
            return inner_html_text, inner_tex_text, inner_jats_xml_text

        html_text = f"<p>{inner_html_text}</p>"
        tex_text = f"<p>{inner_tex_text}</p>"

        if inner_jats_xml_text:
            xml_text = '<p xml:space="preserve">' + inner_jats_xml_text + "</p>"
        else:
            xml_text = '<p xml:space="preserve"/>'

        return html_text, tex_text, xml_text

350 

351 

if __name__ == "__main__":
    # Manual smoke test: convert a sample CKEditor fragment and print the JATS XML.
    html_value = r'<p>Te&lt;st&nbsp;<span class="mathjax-formula">\(x = {-b \pm \sqrt{b^2-4ac} \over 2a}\)</span> done</p><ul><li>Item</li></ul><ol><li>Item 1<br />New line</li><li>&nbsp;</li></ol>'
    # The constructor reads the "mml_formulas" keyword, so it is passed explicitly.
    # No MathML is available for this sample, hence the empty list.
    parser = CkeditorParser(html_value=html_value, mml_formulas=[])
    result = parser.value_xml
    print(result)

356 print(result)