Coverage for apps/ptf/cmds/xml/xml

1import html

2import os

4from lxml import etree

5from lxml import objectify

6from lxml.html import fromstring

9# Unicode to XML

def escape(string):
    """Escape the XML reserved characters ``&``, ``<`` and ``>`` in *string*.

    The ampersand is handled first so that the ``&`` introduced by the
    ``&lt;`` / ``&gt;`` replacements is not escaped a second time.

    :param string: plain text
    :return: the text with ``&``, ``<``, ``>`` replaced by XML entities
    """
    return string.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

# Replace HTML entities like &phi; by their corresponding unicode characters,
# except for the XML reserved characters (&amp; &lt; &gt;), which stay escaped.
def replace_html_entities(text):
    """Return *text* with named character entities replaced by unicode characters.

    ``&amp;``, ``&lt;`` and ``&gt;`` are left as entities. A few entities are
    mapped by hand before ``html.unescape`` runs, and a few characters
    produced by ``html.unescape`` are swapped for other code points afterwards.

    :param text: string that may contain character entities
    :return: the converted string
    """
    # The MathML 2 entities are not always identical to the HTML entities.
    # See https://www.w3.org/TR/xml-entity-names/#changes20080721
    # Manually map the differences.
    # NOTE(review): the entity name on the next line was reconstructed
    # ("&varepsilon;"; "&epsiv;" is the other candidate) - confirm against the data.
    text = text.replace("&varepsilon;", chr(949))
    text = text.replace("&OverBar;", chr(175))
    text = text.replace("&UnderBar;", " " + chr(818))

    # cdrxml.xml files have XML/MathML (?) entities like &phiv;
    # They are converted to unicode characters in recent /cedram_dev/exploitation
    # files (AIF > 2013) but are kept intact in old ones.
    # Need to map the differences.
    text = text.replace("&phiv;", chr(966))
    text = text.replace("&phi;", chr(981))

    # text has HTML entities like &phi; that need to be replaced by the unicode
    # character, but html.unescape() would also replace &lt; &gt; &amp;.
    # The proper solution would be to not call get_xml_from_node and continue
    # the recursive parsing of the MathML nodes.
    # Hack: rename the three reserved entities to placeholders that
    # html.unescape leaves alone, unescape, then restore them.
    text = text.replace("&lt;", "&mylt;").replace("&gt;", "&mygt;").replace("&amp;", "&myamp;")
    text = html.unescape(text)
    text = text.replace("&mylt;", "&lt;").replace("&mygt;", "&gt;").replace("&myamp;", "&amp;")

    # Swap some characters for other code points:
    # U+27E8/U+27E9 -> U+2329/U+232A, U+27E6/U+27E7 -> U+301A/U+301B,
    # U+23DF -> U+FE38.
    # NOTE(review): presumably restores the code points expected downstream
    # for entities such as &lang; / &rang; - confirm.
    text = text.replace(chr(10216), chr(9001)).replace(chr(10217), chr(9002))
    text = text.replace(chr(10214), chr(12314)).replace(chr(10215), chr(12315))
    text = text.replace(chr(9183), chr(65080))

    return text

def normalize(name):
    """Strip a leading ``{namespace}`` prefix from a tag or attribute name.

    ``"{http://ns}tag"`` becomes ``"tag"``. Names without the prefix, the
    empty string, and names with an unterminated ``{`` are returned unchanged.

    :param name: qualified or plain name (string)
    :return: the local part of the name
    """
    if name.startswith("{"):
        # Split on the first "}" only, so a local part that itself contains
        # "}" no longer breaks the unpacking.
        _, closing, tag = name[1:].partition("}")
        if closing:
            return tag
    return name

def get_xml_file_count(folder):
    """Count the directories found under *folder* that have a matching XML file.

    For every directory name ``d`` yielded by ``os.walk(folder)`` the file
    ``<folder>/d/d.xml`` is tested; it is counted when it exists and the path
    of the directory being walked contains fewer than 3 path separators.

    NOTE(review): the candidate path is built from ``folder`` rather than
    from the directory being walked, and the depth test counts separators of
    the whole path - confirm both are intended.

    :param folder: root folder to walk
    :return: number of matching XML files
    """
    total = 0
    for root, dirs, _files in os.walk(folder):
        # The depth filter only depends on the directory being walked.
        if root.count(os.path.sep) >= 3:
            continue
        for name in dirs:
            candidate = os.path.join(folder, name, name + ".xml")
            if os.path.isfile(candidate):
                total += 1
    return total

def get_xml_from_text(tag, text):
    """Serialize *text* wrapped in a single ``<tag>`` element.

    :param tag: name of the element to create
    :param text: text content of the element
    :return: the element serialized by ``etree.tostring`` as a ``str``
    """
    element = etree.Element(tag)
    element.text = text
    serialized = etree.tostring(element, encoding="UTF-8")
    return serialized.decode("utf-8")

def remove_namespace(tree):
    """Strip the ``{namespace}`` prefix from every element tag of *tree*, in place.

    After the tags are rewritten, ``objectify.deannotate`` is called with
    ``cleanup_namespaces=True`` and ``xsi_nil=True``.

    :param tree: lxml element or tree to modify
    """
    # iter() replaces the deprecated getiterator().
    for elem in tree.iter():
        # Skip nodes whose tag is not string-like (no ``find`` method).
        if not hasattr(elem.tag, "find"):
            continue
        i = elem.tag.find("}")
        if i >= 0:
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)

def get_normalized_attrib(node, attrib_name):
    """Return the value of the attribute whose namespace-less name is *attrib_name*.

    Every attribute of *node* is compared after ``normalize``; when several
    attributes match, the last one wins.

    :param node: element exposing an ``attrib`` mapping, or None
    :param attrib_name: attribute name without namespace
    :return: the attribute value, or None when *node* is None or nothing matches
    """
    if node is None:
        return None

    found = None
    for key in node.attrib:
        if normalize(key) == attrib_name:
            found = node.attrib[key]
    return found

def get_xml_from_node(node):
    """Serialize *node* to an XML string, without declaration and without tail.

    :param node: lxml element, or None
    :return: the serialized XML, or "" when *node* is None
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

102

103

def get_xml_from_node2(node, with_tail=False):
    """Rebuild an XML string for *node* from namespace-less tags and raw text.

    Only the tag name, the text, the children (recursively, with their tails)
    and optionally the tail of *node* are written; attributes are not emitted
    and the text is concatenated as is.

    :param node: element to serialize
    :param with_tail: append ``node.tail`` when True
    :return: the rebuilt string
    """
    tag = normalize(node.tag)

    parts = ["<" + tag + ">"]
    if node.text:
        parts.append(node.text)

    # Children always carry their tail: it belongs to the parent's content.
    parts.extend(get_xml_from_node2(child, True) for child in node)
    parts.append("</" + tag + ">")

    if with_tail and node.tail:
        parts.append(node.tail)

    return "".join(parts)

120

121

# tostring is a useless function for 'text': it simply removes the HTML entities!
def get_old_text_from_node(node):
    """Return the text content of *node* using ``etree.tostring(method="text")``.

    :param node: lxml element, or None
    :return: the text without tail, or "" when *node* is None
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

130

131

def get_text_from_node(node, **kwargs):
    """Concatenate the text of *node* and its descendants, entities replaced.

    Text and tails go through ``replace_html_entities``. The tail of the node
    the recursion started from is skipped (``is_top`` defaults to True and is
    forced to False for the children).

    :param node: element to read, or None
    :param kwargs: ``is_top`` (bool) tells whether *node* is the starting node
    :return: the collected text, "" when *node* is None
    """
    is_top = kwargs.get("is_top", True)
    if node is None:
        return ""

    pieces = []
    if node.text is not None:
        pieces.append(replace_html_entities(node.text))

    # Descendants are never the top node: their tail is part of the content.
    kwargs["is_top"] = False
    for child in node:
        pieces.append(get_text_from_node(child, **kwargs))

    if not is_top and node.tail is not None:
        pieces.append(replace_html_entities(node.tail))

    return "".join(pieces)

149

150

def fix_mfenced_in_mathml(text):
    """Move stray fence characters found before ``<mfenced`` into its attributes.

    For every ``<mfenced`` that is not directly preceded by ``>``, the text
    between the previous ``>`` and the tag is examined. When, once stripped,
    it is 1 or 2 characters long, it is removed and written into the first
    empty ``open=""`` / ``close=""`` attributes that follow.

    :param text: MathML string
    :return: the fixed string
    """
    pos = text.find("<mfenced")
    while pos > -1:
        if pos > 0 and text[pos - 1] != ">":
            # Last ">" before the tag (index 0 is never considered a match).
            gt = max(text.rfind(">", 1, pos), 0)
            stray = text[gt + 1 : pos].strip()

            if len(stray) in (1, 2):
                head = text[: gt + 1]
                tail = text[pos:]

                if len(stray) == 1:
                    # A lone "{" or "(" opens the fence, anything else closes it.
                    if stray in ("{", "("):
                        open_c, close_c = stray, ""
                    else:
                        open_c, close_c = "", stray
                else:
                    first, second = stray
                    # The characters follow the order of the empty attributes.
                    if tail.find('open=""') < tail.find('close=""'):
                        open_c, close_c = first, second
                    else:
                        open_c, close_c = second, first

                tail = tail.replace('open=""', 'open="' + open_c + '"', 1)
                tail = tail.replace('close=""', 'close="' + close_c + '"', 1)
                text = head + tail

        pos = text.find("<mfenced", pos + 1)

    return text

195

196 # chars = ('∥', '|')

197 # for c in chars:

198 # if c + c in math_node_text:

199 # l = math_node_text.split(c + c)

200 # # Bug in lxml. A formula with open="∥" becomes wrong with tostring

201 # # A proper solution would be to rewrite get_xml_from_node and stop using tostring

202 # end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1)

203 # math_node_text = l[0] + end_

204

205

def add_mml_ns(node):
    """Put *node* and all its descendants in the MathML namespace, in place.

    Each tag is first stripped of its current namespace with ``normalize``.

    :param node: element to modify, or None (nothing is done)
    """
    if node is not None:
        local_name = normalize(node.tag)
        node.tag = etree.QName("http://www.w3.org/1998/Math/MathML", local_name)

        for child in node:
            add_mml_ns(child)

216

217

def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Extract the title text (formulas included) from a title-group XML string.

    Only the main language is kept: the first ``title``, ``journal-title``,
    ``article-title`` or ``book-title`` child is used. With
    ``get_trans_title=True`` the first ``trans-title`` found inside a
    ``trans-title-group`` child is used instead.

    :param xml: XML string
    :param kwargs: ``get_trans_title`` plus the options forwarded to
        ``get_text_from_node_with_mathml``
    :return: the title text, or None when no matching node is found
    """
    title_tags = ("title", "journal-title", "article-title", "book-title")

    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    want_translation = kwargs.get("get_trans_title", False)

    for node in tree:
        tag = normalize(node.tag)
        if want_translation:
            if tag == "trans-title-group":
                for child in node:
                    if normalize(child.tag) == "trans-title":
                        return get_text_from_node_with_mathml(child, **kwargs)
        elif tag in title_tags:
            return get_text_from_node_with_mathml(node, **kwargs)

    return None

243

244

def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse *xml* and return the text of its root node, formulas included.

    The ``xmlns:xlink`` declaration is removed from the string before parsing.

    :param xml: XML string
    :param kwargs: options forwarded to ``get_text_from_node_with_mathml``
    :return: the text returned by ``get_text_from_node_with_mathml``
    """
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    root = etree.fromstring(cleaned.encode("utf-8"), parser=parser)
    return get_text_from_node_with_mathml(root, **kwargs)

255

256

def get_text_from_node_with_mathml(node, **kwargs):
    """Return the text of *node*, keeping one representation of each formula.

    For ``inline-formula`` / ``disp-formula`` nodes the serialized ``math``
    alternative (``with_mathml=True``) or ``tex-math`` alternative
    (``with_mathml=False``, the default) is returned. For any other node the
    escaped text is followed by the text of the children.

    NOTE(review): in the non-formula branch ``is_top`` is reset before the
    tail test, so the tail of the starting node is appended too, and tails
    are not escaped while ``node.text`` is - confirm both are intended.

    :param node: element to read, or None
    :param kwargs: ``is_top`` (default True) and ``with_mathml`` (default False)
    :return: the collected text, "" when *node* is None
    """
    if node is None:
        return ""

    kwargs.setdefault("is_top", True)
    kwargs.setdefault("with_mathml", False)
    with_mathml = kwargs["with_mathml"]

    text = ""
    if normalize(node.tag) in ("inline-formula", "disp-formula"):
        remove_namespace(node)

        for child in node:
            if normalize(child.tag) != "alternatives":
                continue
            for alternative in child:
                alt_tag = normalize(alternative.tag)
                if alt_tag == "math" and with_mathml:
                    add_mml_ns(alternative)
                    text = get_xml_from_node(alternative)
                elif alt_tag == "tex-math" and not with_mathml:
                    text = get_xml_from_node(alternative)
    else:
        if node.text:
            text = escape(node.text)

        kwargs["is_top"] = False
        for child in node:
            text += get_text_from_node_with_mathml(child, **kwargs)

    if node.tail and not kwargs["is_top"]:
        text += node.tail

    return text

297

298

def make_links_clickable(href, string):
    """Wrap *string* in an ``<a>`` element when *href* looks like a link.

    *href* falls back to *string* when empty. A link is a value starting
    with ``/`` (internal, no ``target``) or ``http`` (opened in a new tab).
    Anything else is returned as plain text.

    :param href: link target, may be empty or None
    :param string: link text
    :return: the HTML anchor, or the plain text
    """
    href = href or string
    if href == "":
        return string

    is_link = href[0] == "/" or href.startswith("http")

    if is_link and "<" in href:
        # TODO: Bug in Cedrics. URLs can have formulas (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
        # Keep only what precedes the markup as the target...
        href = href.split("<")[0]
        # ...and start the link text at the markup.
        # NOTE(review): nesting of this trimming under the branch above was
        # reconstructed from a flattened source - confirm.
        markup_start = string.find("<")
        if markup_start > 0:
            string = string[markup_start:]

    if not string:
        string = href

    if not is_link:
        return string
    if href[0] == "/":
        return f'<a href="{href}">{string}</a>'
    return f'<a href="{href}" target="_blank">{string}</a>'

325

326

def get_contrib_xml(contrib, is_ref=False):
    """Build the XML fragment describing a contributor.

    Outside of a reference the fragment is wrapped in a ``<contrib>`` element
    carrying the role and the ``corresp`` / ``deceased`` / ``equal-contrib``
    flags. The name is a ``<name>`` element built from the prefix, last name,
    first name and suffix, or a ``<string-name>`` when none is set. Addresses,
    emails (separated by ``{{{``), ORCID and IdRef follow.

    :param contrib: dict describing the contributor; ``string_name`` is
        required when no name part is set, ``role`` when *is_ref* is False
    :param is_ref: True when the contributor belongs to a bibliographic
        reference (no ``<contrib>`` wrapper)
    :return: the XML string
    """
    parts = []

    if not is_ref:
        parts.append(f'<contrib contrib-type="{contrib["role"]}"')
        if contrib.get("corresponding"):
            parts.append(' corresp="yes"')
        if contrib.get("deceased_before_publication"):
            parts.append(' deceased="yes"')
        if contrib.get("equal_contrib"):
            parts.append(' equal-contrib="yes"')
        parts.append(">")

    name_fields = (
        ("prefix", "prefix"),
        ("last_name", "surname"),
        ("first_name", "given-names"),
        ("suffix", "suffix"),
    )
    name = "".join(
        f"<{tag}>{escape(contrib[key])}</{tag}>" for key, tag in name_fields if contrib.get(key)
    )

    if name:
        parts.append(f"<name>{name}</name>")
    elif contrib["string_name"]:
        parts.append(f"<string-name>{contrib['string_name']}</string-name>")
    else:
        # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
        parts.append("<name/>")

    if "addresses" in contrib:
        for address in contrib["addresses"]:
            parts.append("<address><addr-line>" + escape(address) + "</addr-line></address>")

    if contrib.get("email"):
        for email in contrib["email"].split("{{{"):
            parts.append("<email>" + escape(email) + "</email>")

    for id_type in ("orcid", "idref"):
        if contrib.get(id_type):
            parts.append(
                f'<contrib-id contrib-id-type="{id_type}">' + escape(contrib[id_type]) + "</contrib-id>"
            )

    if not is_ref:
        parts.append("</contrib>")

    return "".join(parts)

380

381

def helper_update_name_params(params, use_initials=False):
    """Fill and truncate the name fields of *params*, in place.

    When ``string_name`` is set and ``last_name`` is empty, ``string_name`` is
    split on commas: the first two pieces become the last and first names.
    ``first_name`` / ``last_name`` are then cut to 128 characters,
    ``string_name`` / ``mid`` to 256.

    :param params: dict with the keys ``string_name``, ``last_name``,
        ``first_name`` and ``mid``
    :param use_initials: not used by this function
    """
    # Extract first/last name if they are empty
    if params["string_name"] and not params["last_name"]:
        pieces = params["string_name"].split(",")
        if len(pieces) > 1:
            params["last_name"], params["first_name"] = pieces[0], pieces[1]

    max_lengths = (
        ("first_name", 128),
        ("last_name", 128),
        ("string_name", 256),
        ("mid", 256),
    )
    for key, limit in max_lengths:
        if len(params[key]) > limit:
            params[key] = params[key][:limit]

398

399

def normalise_span(value):
    """Remove the ``<span ...>`` and ``</span>`` tags from *value*.

    The content of the spans is kept. An opening ``<span`` without a closing
    ``>`` is left untouched (it used to make the loop spin forever).

    :param value: text that may contain span tags
    :return: the text without span tags
    """
    start = value.find("<span")
    while start != -1:
        end = value.find(">", start)
        if end == -1:
            # Unterminated tag: nothing more can be removed safely.
            break
        value = value[:start] + value[end + 1 :]
        # Restart from the beginning, as removing a tag can reveal another one.
        start = value.find("<span")

    return value.replace("</span>", "")

412

413

def remove_html(string):
    """Return the text of the HTML fragment *string*, tags removed.

    :param string: HTML fragment, may be empty or None
    :return: the concatenated text nodes, "" for an empty input
    """
    if string:
        return "".join(fromstring(string).itertext())
    return ""

418

419

def normalize_space(value):
    """Remove the superfluous spaces, tabs and newlines of *value*.

    Leading whitespace is dropped, each run of whitespace is reduced to its
    first character and one trailing whitespace character is removed.

    The usual ``" ".join(s.split())`` is not used because it does not work
    with a non-breaking space: Python splits on it, XSLT ignores it.

    :param value: text to clean
    :return: the cleaned text
    """
    whitespace = (" ", "\t", "\n")

    kept = []
    previous_was_space = True  # True at the start so leading whitespace is dropped
    for char in value:
        is_space = char in whitespace
        if not (is_space and previous_was_space):
            kept.append(char)
        previous_was_space = is_space

    result = "".join(kept)
    if len(result) > 1 and result[-1] in whitespace:
        result = result[:-1]

    return result

444

445

def clean_doi(value):
    """Return the DOI found in *value*, without prefix and extra spaces.

    Everything before the first ``10.`` is dropped when it is not already at
    the start, then the result goes through ``normalize_space``.

    :param value: DOI, possibly prefixed (resolver URL, label...)
    :return: the cleaned DOI
    """
    start = value.find("10.")
    trimmed = value[start:] if start > 0 else value
    return normalize_space(trimmed)

453

454

def int_to_Roman(num):
    """Convert the integer *num* to a lowercase roman numeral.

    :param num: integer to convert
    :return: the roman numeral, "" when *num* is zero or negative
    """
    numerals = (
        (1000, "m"),
        (900, "cm"),
        (500, "d"),
        (400, "cd"),
        (100, "c"),
        (90, "xc"),
        (50, "l"),
        (40, "xl"),
        (10, "x"),
        (9, "ix"),
        (5, "v"),
        (4, "iv"),
        (1, "i"),
    )

    pieces = []
    for amount, symbol in numerals:
        if num <= 0:
            break
        count, num = divmod(num, amount)
        pieces.append(symbol * count)
    return "".join(pieces)

466

467

def roman_to_int(s):
    """
    Convert a roman numeral (any case) to an integer.

    Two-letter subtractive groups (IV, IX, XL, XC, CD, CM) are matched before
    single letters. A character that is not a roman digit raises KeyError.

    :type s: str
    :rtype: int
    """
    values = {
        "I": 1,
        "V": 5,
        "X": 10,
        "L": 50,
        "C": 100,
        "D": 500,
        "M": 1000,
        "IV": 4,
        "IX": 9,
        "XL": 40,
        "XC": 90,
        "CD": 400,
        "CM": 900,
    }

    text = s.upper()
    size = len(text)
    total = 0
    pos = 0
    while pos < size:
        pair = text[pos : pos + 2]
        if len(pair) == 2 and pair in values:
            total += values[pair]
            pos += 2
        else:
            total += values[text[pos]]
            pos += 1
    return total

499

500

def get_extid_value_from_link_data(link_data):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value

    :param link_data: dict with link data (ref, mimetype, location...);
        the keys ``rel``, ``location`` and ``metadata`` are read
    :return: (link_type, value), or (None, None) when the link type is not a
        known referential
    """

    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    referentials = (
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    )

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required. Use the node's text when href is empty.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    if link_type == "":
        # No explicit type: guess it from the value (first marker found wins;
        # a marker at position 0 does not count).
        for marker, guessed_type in (("doi.org", "doi"), ("arxiv.org", "arxiv"), ("hal-", "hal")):
            if value.find(marker) > 0:
                link_type = guessed_type
                break

    if link_type not in referentials:
        return (None, None)

    if link_type == "numdam-id":
        link_type = "mathdoc-id"

    if link_type == "doi":
        value = clean_doi(value)
    elif link_type == "arxiv":
        if link_data["metadata"] != "":
            value = link_data["metadata"].replace("arXiv:", "")
        else:
            value = link_data["location"]
        for prefix in ("http://arxiv.org/abs/", "https://arxiv.org/abs/"):
            value = value.replace(prefix, "")
    else:
        value = link_data["metadata"]

    return (link_type, value)

572

573

def handle_pages(page_range):
    """Split a ``"first-last"`` page range into two integers.

    :param page_range: string such as ``"12-34"``, or None
    :return: (fpage, lpage), or (None, None) when *page_range* is None, does
        not have exactly two parts or has a non-numeric part
    """
    try:
        pages = [int(page) for page in page_range.split("-")]
    except (AttributeError, ValueError):
        # AttributeError: page_range has no split() (e.g. None);
        # ValueError: a part is not an integer.
        return None, None

    if len(pages) != 2:
        return None, None
    return pages[0], pages[1]

581

582

def split_kwds(text):
    """Split a keyword string on ``,`` and ``;`` without cutting formulas.

    Formulas are enclosed in ``$...$``; separators found inside them are
    kept. When the number of ``$`` is odd the formulas cannot be paired and
    *text* is returned as a single keyword.

    :param text: keywords separated by commas or semicolons
    :return: list of stripped keywords
    """
    chunks = text.split("$")

    # An even number of chunks means an odd number of "$": do not split.
    if len(chunks) % 2 == 0:
        return [text]

    kwds = []
    pending = ""
    for index, chunk in enumerate(chunks):
        if index % 2:
            # Odd chunks are formulas: glue them back, delimiters included.
            pending += "$" + chunk + "$"
            continue

        pieces = chunk.replace(";", ",").split(",")
        if len(pieces) == 1:
            pending += chunk
        else:
            head, *middle, tail = pieces
            kwds.append(pending + head)
            kwds.extend(middle)
            pending = tail

    if pending:
        kwds.append(pending)

    return [kwd.strip() for kwd in kwds]

610

611

def get_elsevier_image_extensions():
    """Return a new list with the image file extensions (no dot) listed for Elsevier."""
    extensions = ("tif", "tiff", "gif", "png", "jpg", "jpeg", "jc3", "eps", "jc4")
    return list(extensions)

Coverage for apps/ptf/cmds/xml/xml_utils.py: 60%

366 statements