Coverage for apps/ptf/cmds/xml/xml_utils.py: 60%

366 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-19 19:20 +0000

1import html 

2import os 

3 

4from lxml import etree 

5from lxml import objectify 

6from lxml.html import fromstring 

7 

8 

9# Unicode to XML 

def escape(string):
    """Return *string* with the XML reserved characters ``&``, ``<`` and ``>`` escaped.

    Quotes are left untouched.
    """
    reserved = str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"})
    return string.translate(reserved)

12 

13 

14# Replace html entities like &phi; by their corresponding unicode characters 

15# except for XML reserved characters (& < >) 

def replace_html_entities(text):
    """Replace HTML entities such as ``&phi;`` by their unicode characters.

    The XML reserved entities ``&lt;``, ``&gt;`` and ``&amp;`` are kept as entities.
    """
    # The MathML 2 entities are not always identical to the HTML entities
    # (see https://www.w3.org/TR/xml-entity-names/#changes20080721), and old
    # cdrxml.xml files keep entities like &phiv; that recent files already store
    # as unicode characters. These differences are mapped by hand, before the
    # generic conversion below.
    manual_mappings = (
        ("&varepsilon;", chr(949)),
        ("&OverBar;", chr(175)),
        ("&UnderBar;", " " + chr(818)),
        ("&phiv;", chr(966)),
        ("&phi;", chr(981)),
    )
    for entity, character in manual_mappings:
        text = text.replace(entity, character)

    # html.unescape() would also convert &lt; &gt; &amp;.
    # HACK: hide them behind placeholders, unescape, then restore them.
    # The proper solution would be to not call get_xml_from_node and continue
    # the recursive parsing of the MathML nodes.
    placeholders = (("&lt;", "&mylt;"), ("&gt;", "&mygt;"), ("&amp;", "&myamp;"))
    for entity, placeholder in placeholders:
        text = text.replace(entity, placeholder)
    text = html.unescape(text)
    for entity, placeholder in placeholders:
        text = text.replace(placeholder, entity)

    # Some characters produced by html.unescape are swapped for another code point
    # (angle brackets, white square brackets, bottom curly bracket).
    swapped_code_points = {
        10216: 9001,
        10217: 9002,
        10214: 12314,
        10215: 12315,
        9183: 65080,
    }
    return text.translate(swapped_code_points)

45 

46 

def normalize(name):
    """Return the local part of a tag or attribute name, without its ``{namespace}`` prefix.

    ``"{http://www.w3.org/1998/Math/MathML}math"`` gives ``"math"``; a name that
    does not start with ``{`` is returned unchanged.

    An empty name, or a name whose opening brace is never closed, is returned as
    is instead of raising. Only the first ``}`` ends the namespace part.
    """
    if not name.startswith("{"):
        return name

    _, closing_brace, local_name = name[1:].partition("}")
    # No closing brace: this is not a "{namespace}local" name, leave it alone
    return local_name if closing_brace else name

52 

53 

def get_xml_file_count(folder):
    """Count the directories ``<dir>`` met while walking *folder* for which
    ``folder/<dir>/<dir>.xml`` is an existing file.

    Directories whose parent path holds 3 or more path separators are ignored.
    """
    # NOTE(review): the candidate path is built from `folder`, not from the
    # directory being walked, and the depth test counts the separators of the
    # whole path (so it depends on how `folder` itself is written) - confirm
    # both are intended.
    total = 0
    for current_dir, sub_dirs, _ in os.walk(folder):
        if current_dir.count(os.path.sep) >= 3:
            continue
        for name in sub_dirs:
            candidate = os.path.join(folder, name, name + ".xml")
            if os.path.isfile(candidate):
                total += 1
    return total

64 

65 

def get_xml_from_text(tag, text):
    """Build a ``<tag>`` element whose text is *text* and return its XML serialization as a str."""
    element = etree.Element(tag)
    element.text = text
    serialized = etree.tostring(element, encoding="UTF-8")
    return serialized.decode("utf-8")

72 

73 

def remove_namespace(tree):
    """Strip the ``{namespace}`` prefix from the tag of every element of *tree*, in place.

    Once the tags are rewritten, ``objectify.deannotate`` is called to clean up
    the namespace declarations that are no longer used.
    """
    # iter() replaces getiterator(), which lxml documents as deprecated
    for elem in tree.iter():
        # Skip the nodes whose tag is not a string (it has no find() method)
        if not hasattr(elem.tag, "find"):
            continue
        i = elem.tag.find("}")
        if i >= 0:
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)

82 

83 

def get_normalized_attrib(node, attrib_name):
    """Return the value of the attribute of *node* whose namespace-free name is *attrib_name*.

    Returns None when *node* is None or has no such attribute. When several
    attributes share the same local name, the last one wins.
    """
    if node is None:
        return None

    found = None
    for key, value in node.attrib.items():
        if normalize(key) == attrib_name:
            found = value
    return found

93 

94 

def get_xml_from_node(node):
    """Return the XML serialization of *node* (without its tail), or "" when *node* is None."""
    if node is None:
        return ""

    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

102 

103 

def get_xml_from_node2(node, with_tail=False):
    """Rebuild by hand the XML of *node* using namespace-free tags.

    Only the tags, texts and tails are written: attributes are not output and
    the texts are copied as they are. The tail of *node* itself is appended only
    when *with_tail* is true; the tails of its descendants are always kept.
    """
    name = normalize(node.tag)

    pieces = [f"<{name}>"]
    if node.text:
        pieces.append(node.text)
    pieces.extend(get_xml_from_node2(sub_node, True) for sub_node in node)
    pieces.append(f"</{name}>")

    if with_tail and node.tail:
        pieces.append(node.tail)

    return "".join(pieces)

120 

121 

122# tostring is a useless function for 'text': it simply removes the HTML entities!

def get_old_text_from_node(node):
    """Return the text-only serialization of *node* (without its tail), or "" when *node* is None."""
    if node is None:
        return ""

    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

130 

131 

def get_text_from_node(node, **kwargs):
    """Return the concatenated text of *node* and its descendants, HTML entities replaced.

    :param node: the element to read; None gives ""
    :param kwargs: ``is_top`` (default True) tells whether *node* is the node the
        caller asked for. The tail of that node is left out, the tails of its
        descendants are kept.
    """
    if node is None:
        return ""

    is_top = kwargs.get("is_top", True)
    # Every descendant is visited with is_top=False so that its tail is collected
    descendant_kwargs = {**kwargs, "is_top": False}

    pieces = []
    if node.text is not None:
        pieces.append(replace_html_entities(node.text))

    for child in node:
        pieces.append(get_text_from_node(child, **descendant_kwargs))

    if not is_top and node.tail is not None:
        pieces.append(replace_html_entities(node.tail))

    return "".join(pieces)

149 

150 

def fix_mfenced_in_mathml(text):
    """Move fence characters written just before an ``<mfenced`` tag into its attributes.

    When 1 or 2 characters (surrounding whitespace ignored) stand between the
    previous ``>`` and an ``<mfenced`` tag, they are removed from the text and
    stored in the first empty ``open=""`` / ``close=""`` attributes that follow:

    - a single ``{`` or ``(`` becomes the ``open`` value, any other single
      character becomes the ``close`` value;
    - two characters are assigned following the order in which ``open=""`` and
      ``close=""`` appear.

    :param text: serialized MathML
    :return: the corrected text
    """
    i = text.find("<mfenced")
    while i > -1:
        if i > 0 and text[i - 1] != ">":
            # End of the previous tag; -1 when the stray characters start the text
            j = text.rfind(">", 0, i)
            mfenced = text[j + 1 : i].strip()
            if 0 < len(mfenced) < 3:
                left = text[: j + 1]
                right = text[i:]

                if len(mfenced) == 1:
                    if mfenced in ("{", "("):
                        open_c, close_c = mfenced, ""
                    else:
                        open_c, close_c = "", mfenced
                else:
                    first, second = mfenced
                    ri = right.find('open=""')
                    rj = right.find('close=""')
                    if ri < rj:
                        open_c, close_c = first, second
                    else:
                        open_c, close_c = second, first

                right = right.replace('open=""', 'open="' + open_c + '"', 1)
                right = right.replace('close=""', 'close="' + close_c + '"', 1)
                text = left + right
                # The tag moved to the left: resynchronize the index so that the
                # next search does not jump over a following <mfenced
                i = j + 1
        i = text.find("<mfenced", i + 1)

    return text

195 

196 # chars = ('∥', '|') 

197 # for c in chars: 

198 # if c + c in math_node_text: 

199 # l = math_node_text.split(c + c) 

200 # # Bug in lxml. A formula with open="∥" becomes wrong with tostring 

201 # # A proper solution would be to rewrite get_xml_from_node and stop using tostring 

202 # end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1) 

203 # math_node_text = l[0] + end_ 

204 

205 

def add_mml_ns(node):
    """Put *node* and all its descendants in the MathML namespace, in place.

    Any namespace already present in a tag is dropped first. Nothing is done
    when *node* is None.
    """
    if node is None:
        return

    # iter() yields the node itself, then its descendants in document order
    for element in node.iter():
        local_name = normalize(element.tag)
        element.tag = etree.QName("http://www.w3.org/1998/Math/MathML", local_name)

216 

217 

def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Parse *xml* and return the text of its title, formulas included.

    Only the main language is kept, unless ``get_trans_title`` is passed as true:
    the first ``trans-title`` found in a ``trans-title-group`` child is then used
    instead of the first ``title`` / ``journal-title`` / ``article-title`` /
    ``book-title`` child.

    :param xml: XML string holding the title group
    :param kwargs: forwarded to get_text_from_node_with_mathml
    :return: the title text, or None when no matching node is found
    """
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    lenient_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    cleaned_xml = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    root = etree.fromstring(cleaned_xml.encode("utf-8"), parser=lenient_parser)

    want_translation = kwargs.get("get_trans_title", False)
    title_tags = ("title", "journal-title", "article-title", "book-title")

    for node in root:
        node_tag = normalize(node.tag)

        if want_translation:
            if node_tag != "trans-title-group":
                continue
            for candidate in node:
                if normalize(candidate.tag) == "trans-title":
                    return get_text_from_node_with_mathml(candidate, **kwargs)
        elif node_tag in title_tags:
            return get_text_from_node_with_mathml(node, **kwargs)

    return None

243 

244 

def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse *xml* and return the text of its root node, formulas included.

    :param xml: XML string
    :param kwargs: forwarded to get_text_from_node_with_mathml
    """
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    lenient_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    cleaned_xml = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    root = etree.fromstring(cleaned_xml.encode("utf-8"), parser=lenient_parser)

    return get_text_from_node_with_mathml(root, **kwargs)

255 

256 

def get_text_from_node_with_mathml(node, **kwargs):
    """Return the text of *node* and its descendants, formulas kept as XML.

    For an ``inline-formula`` / ``disp-formula`` node, the result is the XML of
    one child of its ``alternatives``: the ``math`` node (moved to the MathML
    namespace) when ``with_mathml`` is true, the ``tex-math`` node otherwise.
    For any other node, the result is its escaped text followed by the text of
    its children.

    :param node: the element to read; None gives ""
    :param kwargs: ``with_mathml`` (default False) selects MathML over TeX;
        ``is_top`` (default True) tells whether *node* is the node the caller
        asked for. The tail of that node is left out, as in get_text_from_node.
    """
    if node is None:
        return ""

    # Read the flag now: it is reset below for the children, and the tail test
    # at the end must still know whether this call is the top-level one.
    is_top = kwargs.setdefault("is_top", True)
    with_mathml = kwargs.setdefault("with_mathml", False)

    text = ""
    tag = normalize(node.tag)

    if tag in ("inline-formula", "disp-formula"):
        remove_namespace(node)

        for child in node:
            if normalize(child.tag) != "alternatives":
                continue
            for alternative in child:
                alternative_tag = normalize(alternative.tag)
                if alternative_tag == "math" and with_mathml:
                    add_mml_ns(alternative)
                    text = get_xml_from_node(alternative)
                elif alternative_tag == "tex-math" and not with_mathml:
                    text = get_xml_from_node(alternative)
    else:
        if node.text:
            text = escape(node.text)

        kwargs["is_top"] = False
        for child in node:
            text += get_text_from_node_with_mathml(child, **kwargs)

    # NOTE(review): the text is escaped above but the tail is appended as is - confirm
    if node.tail and not is_top:
        text += node.tail

    return text

297 

298 

def make_links_clickable(href, string):
    """Wrap *string* in an HTML ``<a>`` element pointing to *href* when *href* looks like a link.

    :param href: target of the link; when empty, *string* is used as the target
    :param string: text (or markup) to display; when empty, *href* is displayed
    :return: ``<a href="...">...</a>`` when the target starts with ``/`` or
        ``http`` (with ``target="_blank"`` for the ``http`` case), otherwise the
        display string without a link
    """
    # NOTE(review): the nesting below was rebuilt from a listing that lost its
    # indentation - confirm against the repository.
    if not href:
        href = string

    if href == "":
        return string

    if href[0] == "/" or href.startswith("http"):
        if "<" in href:
            # TODO: Bug in Cedrics. URLs can have formulas (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
            # Keep only the part of the URL that precedes the markup
            href = href.split("<")[0]

            # ...and display the string from its first "<" onwards
            i = string.find("<")
            if i > 0:
                string = string[i:]

    if not string:
        string = href

    if href[0] == "/" or href.startswith("http"):
        if href[0] == "/":
            # Target starting with "/": no target attribute
            return f'<a href="{href}">{string}</a>'
        else:
            # Target starting with "http": opened in a new tab
            return f'<a href="{href}" target="_blank">{string}</a>'

    return string

325 

326 

def get_contrib_xml(contrib, is_ref=False):
    """Build the XML describing a contributor.

    :param contrib: dict-like contributor data. ``role`` is read when *is_ref*
        is false and ``string_name`` when no name part is set; every other key
        is optional.
    :param is_ref: when true, the enclosing ``<contrib>`` element is not written
        (only its content is returned)
    :return: the XML as a str
    """

    def filled(key):
        # The key is present and holds a non-empty value
        return key in contrib and contrib[key]

    pieces = []

    if not is_ref:
        opening = f'<contrib contrib-type="{contrib["role"]}"'
        if filled("corresponding"):
            opening += ' corresp="yes"'
        if filled("deceased_before_publication"):
            opening += ' deceased="yes"'
        if filled("equal_contrib"):
            opening += ' equal-contrib="yes"'
        pieces.append(opening + ">")

    name_elements = (
        ("prefix", "prefix"),
        ("last_name", "surname"),
        ("first_name", "given-names"),
        ("suffix", "suffix"),
    )
    name = ""
    for key, tag in name_elements:
        if filled(key):
            name += f"<{tag}>{escape(contrib[key])}</{tag}>"

    if name != "":
        pieces.append(f"<name>{name}</name>")
    elif contrib["string_name"]:
        # The string name is written as it is, without escaping
        pieces.append(f"<string-name>{contrib['string_name']}</string-name>")
    else:
        # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
        pieces.append("<name/>")

    if "addresses" in contrib:
        for address in contrib["addresses"]:
            pieces.append("<address><addr-line>" + escape(address) + "</addr-line></address>")

    if filled("email"):
        # Several emails are stored in one string, separated by "{{{"
        for email in contrib["email"].split("{{{"):
            pieces.append("<email>" + escape(email) + "</email>")

    for id_type in ("orcid", "idref"):
        if filled(id_type):
            pieces.append(
                f'<contrib-id contrib-id-type="{id_type}">'
                + escape(contrib[id_type])
                + "</contrib-id>"
            )

    if not is_ref:
        pieces.append("</contrib>")

    return "".join(pieces)

380 

381 

def helper_update_name_params(params, use_initials=False):
    """Complete and truncate the name fields of *params*, in place.

    When ``string_name`` is set and ``last_name`` is empty, a ``string_name``
    holding a comma is split: the part before the first comma becomes
    ``last_name`` and the part after it ``first_name`` (stored as they are).
    ``first_name`` and ``last_name`` are then cut to 128 characters,
    ``string_name`` and ``mid`` to 256.

    :param params: dict with the keys ``string_name``, ``last_name``,
        ``first_name`` and ``mid``
    :param use_initials: not used by this function
    """
    # Extract first/last name if they are empty
    if params["string_name"] and not params["last_name"]:
        name_parts = params["string_name"].split(",")
        if len(name_parts) > 1:
            params["last_name"], params["first_name"] = name_parts[0], name_parts[1]

    max_lengths = (
        ("first_name", 128),
        ("last_name", 128),
        ("string_name", 256),
        ("mid", 256),
    )
    for key, max_length in max_lengths:
        if len(params[key]) > max_length:
            params[key] = params[key][0:max_length]

398 

399 

def normalise_span(value):
    """Remove the superfluous ``<span ...>`` and ``</span>`` tags from a text.

    The content of the spans is kept. An opening ``<span`` that is never closed
    by a ``>`` is left untouched.
    """
    start = value.find("<span")
    while start != -1:
        end = value.find(">", start)
        if end == -1:
            # Unterminated tag: nothing can be removed, and searching again
            # would find the same position forever
            break
        value = value[:start] + value[end + 1 :]
        start = value.find("<span")

    return value.replace("</span>", "")

412 

413 

def remove_html(string):
    """Return the text content of the HTML fragment *string*, or "" when it is empty or None."""
    if string:
        return "".join(fromstring(string).itertext())
    return ""

418 

419 

def normalize_space(value):
    """Remove the superfluous spaces from a text.

    Leading and trailing whitespace is dropped and every run of spaces, tabs and
    newlines is reduced to its first character. Only these 3 characters count as
    whitespace: the usual ``" ".join(s.split())`` is not used because Python
    also splits on a non-breaking space, whereas XSLT ignores it.
    """
    skips = (" ", "\t", "\n")

    kept = []
    after_space = True  # True at the start so that leading whitespace is dropped
    for char in value:
        is_space = char in skips
        if not (is_space and after_space):
            kept.append(char)
        after_space = is_space

    # At most one whitespace character can remain at the end
    if kept and kept[-1] in skips:
        kept.pop()

    return "".join(kept)

444 

445 

def clean_doi(value):
    """Return the DOI held in *value*: what precedes the first ``10.`` is dropped,
    then the spaces are normalized.
    """
    start = value.find("10.")
    doi = value[start:] if start > 0 else value
    return normalize_space(doi)

453 

454 

def int_to_Roman(num):
    """Return the lowercase roman numeral of the integer *num* ("" when *num* <= 0)."""
    numerals = (
        (1000, "m"),
        (900, "cm"),
        (500, "d"),
        (400, "cd"),
        (100, "c"),
        (90, "xc"),
        (50, "l"),
        (40, "xl"),
        (10, "x"),
        (9, "ix"),
        (5, "v"),
        (4, "iv"),
        (1, "i"),
    )

    pieces = []
    for value, symbol in numerals:
        if num <= 0:
            break
        repeat, num = divmod(num, value)
        pieces.append(symbol * repeat)
    return "".join(pieces)

466 

467 

def roman_to_int(s):
    """
    Convert a roman numeral to an integer (the case of the letters is ignored).

    The numeral is read from left to right; a subtractive pair (IV, IX, XL, XC,
    CD, CM) is consumed as a whole, any other letter is added on its own.

    :type s: str
    :rtype: int
    :raises KeyError: on a character that is not a roman digit
    """
    digits = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    subtractive_pairs = {"IV": 4, "IX": 9, "XL": 40, "XC": 90, "CD": 400, "CM": 900}

    numeral = s.upper()
    total = 0
    position = 0
    while position < len(numeral):
        pair = numeral[position : position + 2]
        if pair in subtractive_pairs:
            total += subtractive_pairs[pair]
            position += 2
        else:
            total += digits[numeral[position]]
            position += 1
    return total

499 

500 

def get_extid_value_from_link_data(link_data):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value

    :param link_data: dict with link data (ref, mimetype, location...).
        The keys ``rel``, ``location`` and ``metadata`` are read.
    :return: (link_type, value), or (None, None) when the link type is not a
        known referential
    """

    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    referentials = [
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    ]

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required. Use the node's text when href is empty.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    if link_type == "":
        # Guess the type from the value.
        # NOTE(review): "> 0" ignores a match at the very start of the value - confirm
        guesses = (("doi.org", "doi"), ("arxiv.org", "arxiv"), ("hal-", "hal"))
        for marker, guessed_type in guesses:
            if value.find(marker) > 0:
                link_type = guessed_type
                break

    if link_type not in referentials:
        return (None, None)

    if link_type == "numdam-id":
        link_type = "mathdoc-id"

    if link_type == "doi":
        value = clean_doi(value)
    elif link_type == "arxiv":
        metadata = link_data["metadata"]
        if metadata != "":
            value = metadata.replace("arXiv:", "")
        else:
            value = link_data["location"]
            for url_prefix in ("http://arxiv.org/abs/", "https://arxiv.org/abs/"):
                value = value.replace(url_prefix, "")
    else:
        value = link_data["metadata"]

    return (link_type, value)

572 

573 

def handle_pages(page_range):
    """Split a ``"first-last"`` page range into a tuple of 2 integers.

    :return: (fpage, lpage), or (None, None) when *page_range* is None or is not
        made of exactly 2 integers separated by ``-``
    """
    try:
        pages = [int(page) for page in page_range.split("-")]
    except (AttributeError, ValueError):
        # AttributeError means: page_range = None
        return None, None

    if len(pages) != 2:
        return None, None

    fpage, lpage = pages
    return fpage, lpage

581 

582 

def split_kwds(text):
    """Split a string of keywords on ``,`` and ``;``, except inside ``$...$`` formulas.

    :return: the list of keywords, each one stripped. When the number of ``$``
        is odd, the formulas cannot be delimited: no split is attempted and
        ``[text]`` is returned.
    """
    segments = text.split("$")

    # Formulas are encapsulated inside $...$: an even number of segments means
    # an odd number of '$'
    if len(segments) % 2 == 0:
        return [text]

    keywords = []
    pending = ""  # keyword being built, possibly across several formulas
    for index, segment in enumerate(segments):
        if index % 2 == 1:
            # Inside a formula: copied as a whole, delimiters included
            pending = pending + "$" + segment + "$"
            continue

        head, *rest = segment.replace(";", ",").split(",")
        if not rest:
            # No separator: the pending keyword goes on
            pending += segment
            continue

        keywords.append(pending + head)
        *complete, pending = rest
        keywords.extend(complete)

    if pending:
        keywords.append(pending)

    return [keyword.strip() for keyword in keywords]

610 

611 

def get_elsevier_image_extensions():
    """Return the list of image file extensions (without the dot) used for Elsevier content."""
    return "tif tiff gif png jpg jpeg jc3 eps jc4".split()