Coverage for apps/ptf/cmds/xml/xml_utils.py: 60%
366 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1import html
2import os
4from lxml import etree
5from lxml import objectify
6from lxml.html import fromstring
# Unicode to XML
def escape(string):
    """Escape the XML reserved characters in *string*.

    The "&" must be replaced first so that the "&" introduced by the other
    replacements is not escaped a second time.
    (Fix: the literals had been mangled by an HTML-entity decode pass,
    turning the function into replacing each character by itself.)
    """
    return string.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
# Replace html entities like &phi; by their corresponding unicode characters,
# except for the XML reserved characters (& < >).
def replace_html_entities(text):
    """Map HTML/MathML entity references in *text* to unicode characters.

    The XML-reserved entities (&amp; &lt; &gt;) are deliberately preserved
    so the result can still be embedded in XML.

    NOTE(review): the entity-name literals below were reconstructed from a
    garbled (entity-decoded) dump of this file; verify against the original.
    """
    # The MathML 2 entities are not always identical to the HTML entities.
    # See https://www.w3.org/TR/xml-entity-names/#changes20080721
    # Manually map the differences.
    text = text.replace("&epsiv;", chr(949))
    text = text.replace("&OverBar;", chr(175))
    text = text.replace("&UnderBar;", " " + chr(818))

    # cdrxml.xml files have XML/MathML (?) entities like &phiv;.
    # They are converted to unicode characters in recent
    # /cedram_dev/exploitation files (AIF > 2013) but are kept intact in old
    # ones. Need to map the differences.
    text = text.replace("&phiv;", chr(966))
    text = text.replace("&phi;", chr(981))

    # text has html entities like &phi; that need to be replaced by the
    # unicode character, but html.unescape() would also replace &lt; &gt; &amp;.
    # The proper solution would be to not call get_xml_from_node and continue
    # the recursive parsing of mathml nodes.
    # A hack is used: rename the reserved entities to fake ones html.unescape
    # does not know, unescape, then restore them.
    text = text.replace("&lt;", "&mylt;").replace("&gt;", "&mygt;").replace("&amp;", "&myamp;")
    text = html.unescape(text)
    text = text.replace("&mylt;", "&lt;").replace("&mygt;", "&gt;").replace("&myamp;", "&amp;")

    # Bug in html.unescape ? Why does this module replace a unicode by another ?
    # Undo the unwanted character substitutions.
    text = text.replace(chr(10216), chr(9001)).replace(chr(10217), chr(9002))
    text = text.replace(chr(10214), chr(12314)).replace(chr(10215), chr(12315))
    text = text.replace(chr(9183), chr(65080))

    return text
def normalize(name):
    """Return the local part of a Clark-notation tag name ("{uri}local")."""
    if name[0] != "{":
        # No namespace prefix: the name is already local.
        return name
    _uri, local = name[1:].split("}")
    return local
def get_xml_file_count(folder):
    """Count the <dir>/<dir>.xml files found while walking *folder*.

    A directory "d" is counted when "folder/d/d.xml" exists and the current
    walk root is shallow (fewer than 3 path separators).
    NOTE(review): the depth guard counts separators in the root path as
    spelled, so it depends on how *folder* itself is written — confirm intent.
    """
    total = 0
    for root, dirs, _files in os.walk(folder):
        # The depth only depends on root, so test it once per root.
        if root.count(os.path.sep) >= 3:
            continue
        for name in dirs:
            candidate = os.path.join(folder, name, name + ".xml")
            if os.path.isfile(candidate):
                total += 1
    return total
def get_xml_from_text(tag, text):
    """Serialize a single element "<tag>text</tag>" as a UTF-8 XML string.

    Because an explicit encoding is passed to tostring(), lxml prepends an
    XML declaration to the result.
    """
    element = etree.Element(tag)
    element.text = text
    return etree.tostring(element, encoding="UTF-8").decode("utf-8")
def remove_namespace(tree):
    """Strip namespace prefixes from every tag of *tree*, in place.

    Also removes the now-unused namespace declarations and xsi:nil
    annotations via objectify.deannotate().
    """
    # iter() replaces getiterator(), which is deprecated and removed from
    # recent lxml/ElementTree releases; iteration order is unchanged.
    for elem in tree.iter():
        if not hasattr(elem.tag, "find"):
            continue  # skip comments / processing instructions (callable tag)
        i = elem.tag.find("}")
        if i >= 0:
            # "{uri}tag" -> "tag"
            elem.tag = elem.tag[i + 1 :]
    objectify.deannotate(tree, cleanup_namespaces=True, xsi_nil=True)
def get_normalized_attrib(node, attrib_name):
    """Return the value of *attrib_name* on *node*, ignoring namespaces.

    Returns None when *node* is None or no attribute matches. When several
    namespaced attributes share the same local name, the last one wins
    (the loop keeps overwriting, as in the original).
    """
    if node is None:
        return None
    found = None
    for qualified_name in node.attrib:
        if normalize(qualified_name) == attrib_name:
            found = node.attrib[qualified_name]
    return found
def get_xml_from_node(node):
    """Serialize *node* to a unicode XML string (no declaration, no tail).

    Returns "" when *node* is None.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )
def get_xml_from_node2(node, with_tail=False):
    """Hand-rolled recursive serialization of *node*.

    Namespace prefixes are dropped and attributes are ignored; children are
    always serialized with their tails, the root only when *with_tail* is True.
    """
    tag = normalize(node.tag)

    parts = ["<" + tag + ">"]
    if node.text:
        parts.append(node.text)
    for child in node:
        parts.append(get_xml_from_node2(child, True))
    parts.append("</" + tag + ">")
    if with_tail and node.tail:
        parts.append(node.tail)

    return "".join(parts)
# tostring is a useless function for 'text': it simply removes the HTML entities!
def get_old_text_from_node(node):
    """Extract plain text from *node* via lxml's method="text" serializer.

    Returns "" when *node* is None. Kept for reference; see get_text_from_node
    for the entity-aware replacement.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )
def get_text_from_node(node, **kwargs):
    """Recursively collect the text of *node*, mapping entities to unicode.

    kwargs["is_top"] marks the root call: the root node's own tail is not
    included, while every descendant's tail is.
    """
    # setdefault both reads and records the flag, like the original's
    # conditional assignment.
    is_top = kwargs.setdefault("is_top", True)

    collected = ""
    if node is not None:
        if node.text is not None:
            collected += replace_html_entities(node.text)

        kwargs["is_top"] = False
        for child in node:
            collected += get_text_from_node(child, **kwargs)

        if not is_top and node.tail is not None:
            collected += replace_html_entities(node.tail)

    return collected
def fix_mfenced_in_mathml(text):
    """Repair <mfenced> elements whose open/close characters were lost.

    lxml's tostring() can serialize <mfenced open="X" close="Y"> wrongly:
    the fence characters end up as stray text just before the tag while the
    attributes become open=""/close="". This scans for each "<mfenced" and,
    when one or two stray characters sit between the previous ">" and the
    tag, moves them back into the first empty open=""/close="" attributes.
    """
    i = 0
    keep_testing = True
    while keep_testing:
        i = text.find("<mfenced", i)
        keep_testing = i > -1
        # Only act when the character right before "<mfenced" is not ">",
        # i.e. stray fence text precedes the tag.
        if i > 0 and text[i - 1] != ">":
            # Walk back to the previous ">" to isolate the stray characters.
            j = i - 1
            while j > 0 and text[j] != ">":
                j -= 1
            mfenced = text[j + 1 : i].strip()
            # Fences are 1 char (open-only or close-only) or 2 chars (both).
            if 0 < len(mfenced) < 3:
                if len(mfenced) == 1:
                    first = mfenced
                    second = ""
                else:
                    first = mfenced[0]
                    second = mfenced[1]

                # left keeps everything through the previous ">", dropping the
                # stray fence text; right starts at "<mfenced".
                left = text[: j + 1]
                right = text[i:]

                if second == "":
                    # A single stray char: decide open vs close by its shape.
                    if mfenced in ("{", "("):
                        open_c = mfenced
                        close_c = ""
                    else:
                        close_c = mfenced
                        open_c = ""
                else:
                    # Two chars: assign them in the order the empty
                    # open=""/close="" attributes appear in the tag.
                    ri = right.find('open=""')
                    rj = right.find('close=""')
                    if ri < rj:
                        open_c = first
                        close_c = second
                    else:
                        open_c = second
                        close_c = first

                right = right.replace('open=""', 'open="' + open_c + '"', 1)
                right = right.replace('close=""', 'close="' + close_c + '"', 1)
                text = left + right
        # Always advance so the loop cannot re-find the same "<mfenced".
        i += 1

    return text

    # chars = ('&par;', '|')
    # for c in chars:
    #     if c + c in math_node_text:
    #         l = math_node_text.split(c + c)
    #         # Bug in lxml. A formula with open="..." becomes wrong with tostring
    #         # A proper solution would be to rewrite get_xml_from_node and stop using tostring
    #         end_ = l[1].replace('open=""', 'open="' + c + '"', 1).replace('close=""', 'close="' + c + '"', 1)
    #         math_node_text = l[0] + end_
def add_mml_ns(node):
    """Recursively requalify *node* and its descendants into the MathML namespace."""
    if node is None:
        return

    local = normalize(node.tag)
    node.tag = etree.QName("http://www.w3.org/1998/Math/MathML", local)

    for child in node:
        add_mml_ns(child)
def get_text_from_original_title_with_mathml(xml, **kwargs):
    """Extract the main-language title text from a title-group XML string.

    With kwargs["get_trans_title"]=True, the <trans-title> inside a
    <trans-title-group> is extracted instead. Returns None when no matching
    node is found. (on ne garde que la lang principale)
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    # Drop the xlink namespace declaration so plain tag names match.
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")
    tree = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    get_trans_title = kwargs.get("get_trans_title", False)
    title_tags = ("title", "journal-title", "article-title", "book-title")

    for node in tree:
        tag = normalize(node.tag)
        if get_trans_title and tag == "trans-title-group":
            for child in node:
                if normalize(child.tag) == "trans-title":
                    return get_text_from_node_with_mathml(child, **kwargs)
        elif not get_trans_title and tag in title_tags:
            return get_text_from_node_with_mathml(node, **kwargs)
def get_text_from_xml_with_mathml(xml, **kwargs):
    """Parse an XML string and extract its text, optionally keeping MathML.

    kwargs are forwarded to get_text_from_node_with_mathml (notably
    "with_mathml").
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")
    # Drop the xlink namespace declaration so plain tag names match.
    cleaned = xml.replace('xmlns:xlink="http://www.w3.org/1999/xlink"', "")

    root = etree.fromstring(cleaned.encode("utf-8"), parser=parser)
    return get_text_from_node_with_mathml(root, **kwargs)
def get_text_from_node_with_mathml(node, **kwargs):
    """Recursively extract the text of *node*, serializing formulas whole.

    Inside <inline-formula>/<disp-formula>, the <alternatives> children are
    searched: with kwargs["with_mathml"]=True the <math> alternative is kept
    (re-namespaced to MathML), otherwise the <tex-math> alternative is kept.
    All other nodes contribute their (XML-escaped) text plus their children's
    text recursively.

    kwargs["is_top"] marks the root call; kwargs["with_mathml"] defaults False.
    """
    text = ""

    if node is None:
        return text

    # Default the flags in kwargs itself so recursive calls inherit them.
    kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True
    kwargs["with_mathml"] = kwargs["with_mathml"] if "with_mathml" in kwargs else False

    tag = normalize(node.tag)

    if tag == "inline-formula" or tag == "disp-formula":
        # Serialize the chosen alternative verbatim instead of recursing.
        remove_namespace(node)
        for child in node:
            tag = normalize(child.tag)
            if tag == "alternatives":
                for alternative in child:
                    tag = normalize(alternative.tag)
                    if tag == "math" and kwargs["with_mathml"]:
                        add_mml_ns(alternative)
                        text = get_xml_from_node(alternative)
                    elif tag == "tex-math" and not kwargs["with_mathml"]:
                        text = get_xml_from_node(alternative)
    else:
        if node.text:
            text += node.text
        text = escape(text)

        kwargs["is_top"] = False

        for child in node:
            child_text = get_text_from_node_with_mathml(child, **kwargs)
            text += child_text

    # NOTE(review): the formula branch leaves kwargs["is_top"] untouched while
    # the else branch forces it False before this check, so a top-level
    # non-formula node appends its own tail — asymmetric with
    # get_text_from_node above; confirm this is intended.
    if node.tail and not kwargs["is_top"]:
        text += node.tail

    return text
def make_links_clickable(href, string):
    """Wrap *string* in an <a> element when *href* (or *string*) is a URL.

    Falls back to *string* itself when the href is empty or not URL-like.
    Internal links (starting with "/") open in place, external ones
    (starting with "http") open in a new tab.
    """
    if not href:
        href = string

    if href == "":
        return string

    if href[0] == "/" or href.startswith("http"):
        if "<" in href:
            # TODO: Bug in Cedrics. URLs can have formulas
            # (https://aif.centre-mersenne.org/item/AIF_2013__63_1_155_0/ [6])
            href = href.split("<")[0]

        # Keep the link text starting at its first tag, if any.
        first_tag = string.find("<")
        if first_tag > 0:
            string = string[first_tag:]
        if not string:
            string = href

    if href[0] == "/" or href.startswith("http"):
        if href[0] == "/":
            return f'<a href="{href}">{string}</a>'
        return f'<a href="{href}" target="_blank">{string}</a>'

    return string
def get_contrib_xml(contrib, is_ref=False):
    """Build the JATS <contrib> XML fragment for one contributor dict.

    With is_ref=True, only the inner name/address/email/id elements are
    emitted (no enclosing <contrib> element). Keys read from *contrib*:
    role, corresponding, deceased_before_publication, equal_contrib, prefix,
    last_name, first_name, suffix, string_name, addresses, email, orcid, idref.
    """
    xml = ""
    if not is_ref:
        xml = f'<contrib contrib-type="{contrib["role"]}"'
        if contrib.get("corresponding"):
            xml += ' corresp="yes"'
        if contrib.get("deceased_before_publication"):
            xml += ' deceased="yes"'
        if contrib.get("equal_contrib"):
            xml += ' equal-contrib="yes"'
        xml += ">"

    name = ""
    if contrib.get("prefix"):
        name += f'<prefix>{escape(contrib["prefix"])}</prefix>'
    if contrib.get("last_name"):
        name += f'<surname>{escape(contrib["last_name"])}</surname>'
    if contrib.get("first_name"):
        name += f'<given-names>{escape(contrib["first_name"])}</given-names>'
    if contrib.get("suffix"):
        name += f'<suffix>{escape(contrib["suffix"])}</suffix>'

    if name:
        xml += f"<name>{name}</name>"
    elif contrib["string_name"]:
        xml += f"<string-name>{contrib['string_name']}</string-name>"
    else:
        # TODO: Bug in Cedrics <nomcomplet> is ignored inside <bauteur> and <bediteur>
        xml += "<name/>"

    if "addresses" in contrib:
        for address in contrib["addresses"]:
            xml += "<address><addr-line>" + escape(address) + "</addr-line></address>"

    if contrib.get("email"):
        # Multiple emails are joined with "{{{" upstream.
        for email in contrib["email"].split("{{{"):
            xml += "<email>" + escape(email) + "</email>"

    if contrib.get("orcid"):
        xml += '<contrib-id contrib-id-type="orcid">' + escape(contrib["orcid"]) + "</contrib-id>"
    if contrib.get("idref"):
        xml += '<contrib-id contrib-id-type="idref">' + escape(contrib["idref"]) + "</contrib-id>"

    if not is_ref:
        xml += "</contrib>"

    return xml
def helper_update_name_params(params, use_initials=False):
    """Derive first/last name from string_name when missing; clamp field lengths.

    Mutates *params* in place. *use_initials* is accepted for API
    compatibility but not used here.
    """
    # Extract first/last name from "Last, First" when last_name is empty.
    if params["string_name"] and not params["last_name"]:
        pieces = params["string_name"].split(",")
        if len(pieces) > 1:
            params["last_name"] = pieces[0]
            params["first_name"] = pieces[1]

    # Clamp each field to its DB column size.
    limits = (("first_name", 128), ("last_name", 128), ("string_name", 256), ("mid", 256))
    for key, limit in limits:
        if len(params[key]) > limit:
            params[key] = params[key][:limit]
def normalise_span(value):
    """Remove <span ...> wrappers (and all </span>) from *value*, keeping content.

    Supprime les spans en trop dans les textes.
    Fix: the original spun forever on a malformed "<span" with no closing ">"
    (find() kept returning the same index and nothing was removed).
    """
    i = 0
    while i != -1:
        i = value.find("<span")
        if i > -1:
            j = value.find(">", i)
            if j == -1:
                # Malformed opening tag with no ">": nothing removable.
                break
            value = value[0:i] + value[j + 1 :]
    value = value.replace("</span>", "")
    return value
def remove_html(string):
    """Return *string* stripped of all HTML markup ("" for falsy input)."""
    if not string:
        return ""
    tree = fromstring(string)
    return "".join(tree.itertext())
def normalize_space(value):
    """Collapse runs of space/tab/newline and trim the edges.

    Supprime les espaces en trop dans les textes.
    The common " ".join(value.split()) answer is wrong here because Python's
    split() also splits on nbsp while xslt ignores it — nbsp must survive.
    Only the first character of each whitespace run is kept.
    """
    whitespace = (" ", "\t", "\n")
    kept = []
    in_ws_run = True  # True at start so leading whitespace is swallowed

    for ch in value:
        if ch in whitespace:
            if in_ws_run:
                continue
            in_ws_run = True
        else:
            in_ws_run = False
        kept.append(ch)

    result = "".join(kept)
    # At most one trailing whitespace char can remain; drop it.
    if len(result) > 1 and result[-1] in whitespace:
        result = result[:-1]

    return result
def clean_doi(value):
    """Strip any URL prefix before the "10." DOI root and normalize spaces."""
    start = value.find("10.")
    if start > 0:
        value = value[start:]
    return normalize_space(value)
def int_to_Roman(num):
    """Convert a positive integer to a lowercase roman numeral ("" for num <= 0)."""
    table = (
        (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"),
        (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
        (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i"),
    )
    digits = []
    for value, symbol in table:
        while num >= value:
            digits.append(symbol)
            num -= value
    return "".join(digits)
def roman_to_int(s):
    """Parse a roman numeral (any case) into an int.

    :type s: str
    :rtype: int

    Subtractive pairs (IV, IX, XL, XC, CD, CM) are matched greedily first.
    Raises KeyError on characters that are not roman digits.
    """
    single = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    pairs = {"IV": 4, "IX": 9, "XL": 40, "XC": 90, "CD": 400, "CM": 900}

    s = s.upper()
    total = 0
    pos = 0
    while pos < len(s):
        two = s[pos : pos + 2]
        if len(two) == 2 and two in pairs:
            total += pairs[two]
            pos += 2
        else:
            total += single[s[pos]]
            pos += 1
    return total
def get_extid_value_from_link_data(link_data):
    """
    Some links have an id to an external database (MR, ZBL, DOI, Numdam).
    Extract the link_type and value.

    :param link_data: dict with link data (rel, location, metadata...)
    :return: (link_type, value), or (None, None) for unrecognized types
    """
    # rdoi: recommendation doi, used by PCI
    # preprint: id of the preprint, used by PCI
    known_types = {
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
        "doi",
        "eid",
        "arxiv",
        "tel",
        "hal",
        "theses.fr",
        "rdoi",
        "preprint",
        "pmid",
        "ark",
    }

    # link_data['rel'] is the ext-link-type or the pub-id-type
    link_type = link_data["rel"] or ""

    # The value attribute is not required. Use the node's text when href is empty.
    value = link_data["location"]
    if value == "":
        value = link_data["metadata"]
    value = value.strip()

    # Guess the type from the URL when rel is absent.
    if link_type == "":
        if value.find("doi.org") > 0:
            link_type = "doi"
        elif value.find("arxiv.org") > 0:
            link_type = "arxiv"
        elif value.find("hal-") > 0:
            link_type = "hal"

    if link_type not in known_types:
        return (None, None)

    if link_type == "numdam-id":
        link_type = "mathdoc-id"

    if link_type == "doi":
        value = clean_doi(value)
    elif link_type == "arxiv":
        if link_data["metadata"] != "":
            value = link_data["metadata"].replace("arXiv:", "")
        else:
            value = link_data["location"]
            value = value.replace("http://arxiv.org/abs/", "").replace(
                "https://arxiv.org/abs/", ""
            )
    else:
        value = link_data["metadata"]

    return (link_type, value)
def handle_pages(page_range):
    """Parse a "fpage-lpage" string into a pair of ints.

    Returns (None, None) when *page_range* is None, has no single "-",
    or either side is not an integer.
    """
    try:
        first, last = page_range.split("-")
        return int(first), int(last)
    except (AttributeError, ValueError):
        # AttributeError: page_range is None; ValueError: bad shape/content.
        return None, None
def split_kwds(text):
    """Split a keyword string on "," / ";" while keeping $...$ formulas intact.

    Returns [text] untouched when the number of "$" is odd (unbalanced math,
    since formulas are encapsulated inside $$).
    """
    segments = text.split("$")

    # An even segment count means an odd number of '$': do not split.
    if len(segments) % 2 == 0:
        return [text]

    keywords = []
    pending = ""
    for index, segment in enumerate(segments):
        if index % 2 == 1:
            # Odd segments are formula bodies: re-wrap and glue to the
            # keyword being accumulated.
            pending += "$" + segment + "$"
            continue

        parts = segment.replace(";", ",").split(",")
        if len(parts) == 1:
            pending += segment
        else:
            keywords.append(pending + parts[0])
            keywords.extend(parts[1:-1])
            pending = parts[-1]

    if pending:
        keywords.append(pending)

    return [kw.strip() for kw in keywords]
def get_elsevier_image_extensions():
    """File extensions of image assets found in Elsevier deliveries."""
    return "tif tiff gif png jpg jpeg jc3 eps jc4".split()