Coverage for apps/ptf/cmds/xml/jats/jats_parser.py: 70%
2055 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1##################################################################################################
2#
3# README
4#
5# jats_parser.py is a replacement of xmldata.py
6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom.
7# Each node is read only once.
8#
9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds.
10# The xml tree is parsed in the class constructor (__init__)
11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables.
12# Some parse_<tag> functions are called directly.
13# Ex: if tag == "article-meta":
14# self.parse_article_meta(child)
15# Other parse_<tag> functions are called "automatically"
16# fct_name = 'parse_' + tag.replace('-', '_')
17# ftor = getattr(self, fct_name, None)
18# if callable(ftor):
19# ftor(child)
20#
21# JatsBase and JatsArticleBase are base classes.
22# They provide common instance variables and their corresponding parse_<tag> functions
23#
24# html_from_<tag> are used to generate the HTML text of a node with mixed content:
25# a node that mixes text, children and tail
26# These functions can also extract data and set instance variables (ex: self.figures)
27#
28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects
29#
30# At the end of this file, there are some functions that are/were called by ptf-tools.
31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser
32#
33# TODO: the import OAI or the import of a collection could simply call the first function
34# (def parser(tree))
35#
36##################################################################################################
38import copy
39import inspect
40import os
41import re
43from lxml import etree
44from pylatexenc.latexencode import unicode_to_latex
46from django.conf import settings
47from django.urls import reverse
48from django.utils import timezone
50from matching import scrapping
51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title
52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors
53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title
54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source
55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume
56from ptf.cmds.xml.citation_html import get_citation_html
57from ptf.cmds.xml.xml_base import RefBase
58from ptf.cmds.xml.xml_base import XmlParserBase
59from ptf.cmds.xml.xml_utils import escape
60from ptf.cmds.xml.xml_utils import get_contrib_xml
61from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions
62from ptf.cmds.xml.xml_utils import get_normalized_attrib
63from ptf.cmds.xml.xml_utils import get_text_from_node
64from ptf.cmds.xml.xml_utils import get_xml_from_node
65from ptf.cmds.xml.xml_utils import helper_update_name_params
66from ptf.cmds.xml.xml_utils import make_links_clickable
67from ptf.cmds.xml.xml_utils import normalize
68from ptf.cmds.xml.xml_utils import normalize_space
69from ptf.cmds.xml.xml_utils import split_kwds
70from ptf.display import resolver
71from ptf.model_data import ArticleData
72from ptf.model_data import BookData
73from ptf.model_data import BookPartData
74from ptf.model_data import CollectionData
75from ptf.model_data import Foo
76from ptf.model_data import IssueData
77from ptf.model_data import JournalData
78from ptf.model_data import MathdocPublicationData
79from ptf.model_data import PublisherData
80from ptf.model_data import create_contributor
81from ptf.model_data import create_extlink
84class JatsBase(XmlParserBase):
def __init__(self, *args, **kwargs):
    """
    Initialise the state shared by the JATS parsers.

    Positional and keyword arguments are accepted but are not forwarded to
    the parent initialiser.
    """
    super().__init__()

    # Root node of the XML tree; assigned by parse_tree()
    self.tree = None
    # HTML of the footnotes collected by parse_node_with_fn()
    self.fns = []
    # {pid: message} dicts appended when a parse function meets an unexpected tag
    self.warnings = []

    # Used to create a Tex file from an XML value (ie abstract)
    self.for_tex_file = False
    # Used to convert an XML value for CKEditor (ie abstract)
    self.add_span_around_tex_formula = False
def parse_tree(self, tree):
    """
    Remember the root node and read its language.

    :param tree: root node of the XML tree
    :return: None. self.tree and self.lang are set ("und" when no lang is found)
    """
    self.tree = tree
    lang = get_normalized_attrib(tree, "lang")
    self.lang = lang if lang else "und"
def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node of the article title
    :param kwargs: forwarded to parse_inner_node; a truthy "is_mixed_citation"
        passes the HTML through add_span_class_to_html_from_article_title
    :return: a (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html
def parse_node_with_break(self, node, **kwargs):
    """
    Convert a line break.

    :return: a (tex, html) tuple: the HTML is always "<br/>"; the TeX is a
        \\newline when a TeX file is being generated, a single space otherwise
    """
    if self.for_tex_file:
        return "\\newline\n", "<br/>"

    return " ", "<br/>"
def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap as a two-cell "formula" table:
    the content in the first cell, the label (if any) in the second.

    :param node: XML node of the chem-struct-wrap
    :param kwargs: forwarded to parse_node_with_mixed_content
    :return: the same HTML text twice (used for both tex and html)
    """
    label_html = None
    content_html = ""

    for child in node:
        is_label = normalize(child.tag) == "label"
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if is_label:
            label_html = child_html
        else:
            content_html += child_html

    struct_id = node.attrib.get("id")

    pieces = ["<table "]
    if struct_id:
        pieces.append(f'id="{struct_id}" ')
    pieces.append(f'class="formula"><tr><td class="formula-inner">{content_html}</td>')
    pieces.append('<td class="formula-label">')
    if label_html:
        pieces.append(label_html)
    pieces.append("</td></tr>")
    pieces.append("</table>")

    table_html = "".join(pieces)
    return table_html, table_html
def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote in a <div class="disp-quote">.

    :return: a (tex, html) tuple; both values get the same wrapping div
    """

    def as_quote(content):
        return f'<div class="disp-quote">{content}</div>'

    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return as_quote(inner_tex), as_quote(inner_html)
def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text in a <div class="boxed-text">.
    The id of the node, if any, is copied onto the div.

    :return: a ("", html) tuple; no TeX is produced
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    box_id = node.attrib.get("id")
    opening = f'<div id="{box_id}" class="boxed-text">' if box_id else '<div class="boxed-text">'

    return "", f"{opening}{inner_html}</div>"
def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    Children handled: label, caption (title and p), graphic, attrib and
    permissions (copyright-statement). Any other tag is recorded in self.warnings.

    :param node: XML node of a fig
    :param kwargs: forwarded to the child parsers. With a truthy "append_floats",
        the HTML is also stored in self.floats (keyed by the fig id) when the
        instance has a floats attribute and the fig has an id
    :return: a ("", html) tuple; no TeX is produced for a figure
    """
    fig_id = node.attrib.get("id")
    label_html = title_html = caption_html = None
    img_html = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)

        elif tag == "caption":
            for caption_child in child:
                caption_tag = normalize(caption_child.tag)

                if caption_tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif caption_tag == "p":
                    _, paragraph_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # Something is already in the caption: flag its first <p>
                        # and append the new paragraph as a small caption
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += paragraph_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = paragraph_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + caption_tag
                        }
                    )

        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html

        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            # caption_html is still None when no caption paragraph was met:
            # do not let the literal text "None" leak into the HTML
            caption_html = f'{caption_html or ""}<p class="fig-small-caption">{attrib_html}</p>'

        elif tag == "permissions":
            for gchild in child:
                if gchild.tag == "copyright-statement":
                    _, copyright_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html = (
                        f'{caption_html or ""}<p class="fig-small-caption">{copyright_html}</p>'
                    )

        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"
        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html
        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html
def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    The footnote is rendered as <p><sup>LABEL</sup> TEXT</p> (the <p> tags of
    the inner paragraph are removed). Tags other than label and p are
    recorded in self.warnings.

    :param node: XML node of a fn
    :param kwargs: forwarded to parse_node_with_mixed_content.
        "keep_fn" (default False): when truthy the HTML is returned in place;
        otherwise it is appended to self.fns (once) and '' is returned.
        "keep_fn_label" (default True): when falsy the label is dropped.
    :return: a ("", html) tuple; html is '' when the footnote is moved to self.fns
    """
    fn_id = node.attrib.get("id")
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            # NOTE(review): each <p> replaces the previous one, so only the last
            # paragraph of a multi-paragraph footnote is kept - confirm this is intended
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"

    if label_html and kwargs.get("keep_fn_label", True):
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    # keep_fn is optional, like keep_fn_label: a missing key used to raise KeyError
    if kwargs.get("keep_fn", False):
        return "", html

    if html not in self.fns:
        self.fns.append(html)

    return "", ""
def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and
    self.related_objects unless an equal dict is already in self.figures.

    :param node: XML node of a graphic
    :param kwargs: kwargs["base_url"] is required as soon as the node has an href
    :return: a ("", html) tuple; html is '' when the node has no href
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display. The previous code reached the return statement
        # with an unbound local variable and raised UnboundLocalError
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename
    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg. WTF ?
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        # NOTE(review): confirm the location must also be rewritten when the
        # href already sits under base_dir
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    # The lightbox index is the number of figures collected so far
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text
def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Convert an inline-formula or a disp-formula.

    The tex-math and math alternatives are both read. The HTML is a
    <span class="mathjax-formula"> holding the MathML (or the TeX when there
    is no MathML), with the TeX stored in its data-tex attribute. The span is
    wrapped in a "formula" table when the formula has a label or is a
    disp-formula.

    :param node: XML node of the formula
    :return: a (tex, html) tuple
    """
    # MathJAX is doing a good job with formulae and is now the standard
    # MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    # We could simply return the tex-math text
    # But there are multiple errors in the TeX of the Mersenne articles.
    # We first need to fix those mistakes before switching to TeX

    is_display = node.tag == "disp-formula"
    formula_id = node.attrib.get("id")
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "label":
            label = child.text or ""
        elif tag == "alternatives":
            for alternative in child:
                alternative_tag = normalize(alternative.tag)

                if alternative_tag == "tex-math":
                    tex_math = alternative.text or ""
                elif alternative_tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative).replace("mml:", "")
                    for declaration in (
                        'xmlns:xlink="http://www.w3.org/1999/xlink"',
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                    ):
                        math_text = math_text.replace(declaration, "")
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Exactly one of the two alternatives is missing: print the parse_ callers
    if (math_text == "") != (tex_math == ""):
        callers = []
        for frameinfo in inspect.stack()[1:]:
            name = frameinfo[3]
            if not name.startswith("parse_"):
                continue
            if any(
                excluded in name
                for excluded in ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
            ):
                continue
            callers.append(name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")
        # raise ValueError("No formula alternative")

    if not is_display:
        # Inline TeX is surrounded with $...$
        if tex_math and not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if tex_math and not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    tex = tex_math
    in_table = bool(label) or is_display

    pieces = []
    if in_table:
        pieces.append('<table class="formula"><tr><td class="formula-inner">')

    pieces.append('<span class="mathjax-formula" ')
    if formula_id:
        pieces.append('id="' + formula_id + '" ')
    alt_text = tex_math.replace("\n", "") if is_display else tex_math
    span_content = math_text if math_text else tex_math
    pieces.append(f'data-tex="{alt_text}">{span_content}</span>')

    if in_table:
        pieces.append('</td><td class="formula-label">')
        if label:
            pieces.append(label)
        pieces.append("</td></tr>")
        pieces.append("</table>")

    html = "".join(pieces)

    if self.add_span_around_tex_formula:
        # Drop the first and last characters of the TeX and use \( \) delimiters
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html
def parse_node_with_institution_id(self, node, **kwargs):
    """An institution-id produces no output: both tex and html are empty."""
    tex = html = ""
    return tex, html
def parse_node_with_italic(self, node, **kwargs):
    """
    Convert an italic node.

    :return: a (tex, html) tuple. The HTML is wrapped in <span class="italique">;
        the TeX is wrapped in {\\it ...} for a TeX file, in <i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    html = f'<span class="italique">{inner_html}</span>'
    tex = "{\\it " + inner_tex + "}" if self.for_tex_file else f"<i>{inner_tex}</i>"

    return tex, html
def parse_node_with_list(self, node, **kwargs):
    """
    Convert a list according to its list-type attribute:
    - bullet, simple: <ul>
    - order, number: <ol type="1"> (with a start attribute for a continued list)
    - alpha-lower, alpha-upper, roman-lower, roman-upper: <ol type="a|A|i|I">
    - anything else: <ul class="no-bullet">

    The HTML is always wrapped in the list tag. The TeX value gets the same
    tag, unless a TeX file is generated: an itemize (bullet, simple) or
    enumerate (all other types) environment is then used.

    :param node: XML node of the list
    :return: a (tex, html) tuple
    """
    ol_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    list_type = node.get("list-type")
    if list_type in ("bullet", "simple"):
        environment = "itemize"
        open_tag, close_tag = "<ul>", "</ul>"
    else:
        environment = "enumerate"
        ol_type = ol_types.get(list_type)
        if ol_type is None:
            open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
            close_tag = "</ul>"
        else:
            # Only numbered lists carry the start value of a continued list
            with_start = ol_type == "1" and start is not None
            start_attr = f' start="{str(start)}"' if with_start else ""
            open_tag = f'<ol type="{ol_type}"{start_attr}>'
            close_tag = "</ol>"

    # The HTML gets its list tag in every mode. It used to be left unwrapped
    # for the non-bullet lists when for_tex_file was set
    html = f"{open_tag}{html}{close_tag}"

    if self.for_tex_file:
        tex = "\n\\begin{" + environment + "}\n" + tex + "\\end{" + environment + "}\n"
    else:
        tex = f"{open_tag}{tex}{close_tag}"

    return tex, html
def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first paragraph is inlined in the <li>. Every following paragraph and
    every nested list is appended after it, in document order. Other tags are
    recorded in self.warnings.

    :param node: XML node of the list-item
    :param kwargs: forwarded to the child parsers
    :return: a (tex, html) tuple; the TeX is an \\item line for a TeX file,
        a <li> otherwise
    """
    label_tex = label_html = title_tex = title_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Accumulate: a plain assignment dropped whatever was collected before
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html
def parse_node_with_name_content(self, node, **kwargs):
    """Return the (tex, html) of the node content, without any decoration."""
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)
    return inner_tex, inner_html
def parse_node_with_p(self, node, **kwargs):
    """
    Convert a paragraph.

    The HTML is wrapped in a <p>, with the specific-use attribute of the node
    as class when present. The TeX is wrapped in a <p> unless a TeX file is
    generated. When the instance has both floats_to_insert and floats, the
    pending floats are appended after the paragraph and removed from both.

    :param node: XML node of the p
    :return: a (tex, html) tuple
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = f"<p>{tex}</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = f"{opening}{inner_html}</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while self.floats_to_insert:
            float_id = self.floats_to_insert.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html
def parse_node_with_sc(self, node, **kwargs):
    """
    Convert small capitals.

    :return: a (tex, html) tuple: the HTML is wrapped in
        <span class="smallcaps">, the TeX is left as is
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, f'<span class="smallcaps">{inner_html}</span>'
def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of the sec
    :param kwargs: "sec_level" is the heading level of this section (2 by default);
        the children are parsed with sec_level + 1. The label and the title
        are parsed without any kwargs.
    :return: a (tex, html) tuple; the TeX is "LABEL TITLE" followed by the TeX of the children
    """
    sec_level = kwargs.get("sec_level", 2)
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    children_tex = children_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            children_tex += child_tex
            children_html += child_html

    # Keep the label and/or the title that have some HTML, in that order
    heading = [
        (part_tex, part_html)
        for part_tex, part_html in ((label_tex, label_html), (title_tex, title_html))
        if part_html
    ]

    tex = ""
    html = "<section>"
    if heading:
        tex += " ".join(part_tex for part_tex, _ in heading)
        html += f"<h{str(sec_level)}>"
        html += " ".join(part_html for _, part_html in heading)
        html += f"</h{str(sec_level)}>"

    tex += children_tex
    html += children_html + "</section>"

    return tex, html
def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    :param kwargs: forwarded to parse_inner_node; with a truthy
        "is_mixed_citation" the HTML is title-cased then passed through
        add_span_class_to_html_from_authors
    :return: a (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html
def parse_node_with_strong(self, node, **kwargs):
    """
    Convert bold text.

    :return: a (tex, html) tuple: the TeX is wrapped in {\\bf ...} for a TeX
        file, in <strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"
    # NOTE(review): the HTML is wrapped whatever the mode - confirm
    html = f"<strong>{inner_html}</strong>"

    return tex, html
def parse_node_with_styled_content(self, node, **kwargs):
    """
    Convert a styled-content node.

    :return: a (tex, html) tuple: the HTML is wrapped in a <span> carrying
        the style attribute of the node when it is present and not empty
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style != "":
        html = f'<span style="{style}">{html}</span>'

    return tex, html
def parse_node_with_sub(self, node, **kwargs):
    """
    Convert a subscript.

    :return: a (tex, html) tuple: the TeX uses \\textsubscript{...} for a TeX
        file, <sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else f"<sub>{inner_tex}</sub>"
    # NOTE(review): the HTML is wrapped whatever the mode - confirm
    html = f"<sub>{inner_html}</sub>"

    return tex, html
def parse_node_with_sup(self, node, **kwargs):
    """
    Convert a superscript.

    :return: a (tex, html) tuple: the TeX uses \\textsuperscript{...} for a
        TeX file, <sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = (
        "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else f"<sup>{inner_tex}</sup>"
    )
    # NOTE(review): the HTML is wrapped whatever the mode - confirm
    html = f"<sup>{inner_html}</sup>"

    return tex, html
def parse_node_with_table_generic(self, node, **kwargs):
    """
    Convert a table element (table, tr, td, ...) into the HTML element of
    the same name. "row" is renamed "tr" and "entry" is renamed "td".

    A table gets the "table" class, plus a nowrap-col-<i> class for each
    colgroup/col that has a width. The rowspan, colspan, align and style
    attributes are copied; valign becomes a td-valign-<value> class.

    :param node: XML node of the table element
    :return: a ("", html) tuple; no TeX is produced
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    normalized_tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(normalized_tag, normalized_tag)

    attributes = []

    if tag == "table":
        classes = ["table"]
        # NOTE(review): i is taken as the 1-based position of the col among all
        # the cols, not among those with a width - confirm
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{position}")
        class_value = " ".join(classes)
        attributes.append(f' class="{class_value}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attributes.append(" " + name + '="' + node.attrib[name] + '"')

    if "valign" in node.attrib:
        # NOTE(review): a table with a valign ends up with two class attributes
        attributes.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attributes.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attributes) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"
def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.
    The label (in bold) and the caption go in a "table-wrap-header" div,
    followed by the HTML of the other children.

    :param node: XML node of the table-wrap
    :param kwargs: forwarded to the child parsers. With a truthy "append_floats",
        the HTML is also stored in self.floats (keyed by the table id) when
        the instance has a floats attribute and the node has an id
    :return: a ("", html) tuple; no TeX is produced
    """
    label = caption = None
    body_html = ""

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body_html += child_html

    table_id = node.attrib.get("id")
    if table_id:
        text = '<div class="table-wrap table-responsive" id="' + table_id + '">'
    else:
        text = '<div class="table-wrap table-responsive">'

    header_parts = []
    if label:
        header_parts.append("<strong>" + label + "</strong>")
    if caption:
        header_parts.append(caption)
    if header_parts:
        text += '<div class="table-wrap-header">' + " ".join(header_parts) + "</div>"

    text += body_html + "</div>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text
def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Create a <div class="table-wrap-foot"> at bottom of the table.
    Keep the footnotes inside this div: the fn-group children are parsed
    with keep_fn=True. Other children are ignored.

    :param node: XML node of the table-wrap-foot
    :return: a ("", html) tuple; no TeX is produced
    """
    kwargs["keep_fn"] = True

    pieces = ['<div class="table-wrap-foot">']
    for child in node:
        if normalize(child.tag) != "fn-group":
            continue
        _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
        pieces.append(group_html)
    pieces.append("</div>")

    return "", "".join(pieces)
def parse_node_with_toc(self, node, **kwargs):
    """
    Convert a table of contents: the HTML of its content is wrapped in a <table>.

    :return: a ("", html) tuple; the TeX of the content is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", f"<table>{rows_html}</table>"
def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Convert a toc-entry into a table row: "LABEL TITLE" in the first cell,
    "p. PAGE" in the second one. The rows of the embedded toc-entry children
    are appended after it.

    When a nav-pointer-group gives a "pageindex" nav-pointer, both cells link
    to that page (index + 1) of the PDF of self.pid.

    :param node: XML node of the toc-entry
    :param kwargs: forwarded to the child parsers, except for the embedded
        toc-entry children, which only receive inside_toc_entry=True.
        A truthy "inside_toc_entry" adds the "inside-toc" class to the first cell.
    :return: a ("", html) tuple; no TeX is produced
    """
    label = title = page = nested_rows = ""
    anchor = ""
    toc_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer-group":
            for grandchild in child:
                if grandchild.tag != "nav-pointer":
                    continue
                usage = grandchild.attrib.get("specific-use")
                if usage == "pagenum":
                    _, page = self.parse_node_with_mixed_content(grandchild, **kwargs)
                elif usage == "pageindex":
                    anchor = int(grandchild.text) + 1
        elif tag == "toc-entry":
            _, nested_html = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            nested_rows += nested_html

    toc_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        toc_text = f'<a href="{href}">{toc_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{toc_class}">{toc_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows
def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an <underline> node in <u>...</u>.

    The same wrapping is applied to the TeX and to the HTML values.

    :param node: the underline XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)
    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"
def parse_node_with_volume(self, node, **kwargs):
    """
    Parse a <volume> node.

    Inside a mixed citation (is_mixed_citation option), the HTML is passed
    through add_span_class_to_html_from_volume; otherwise it is returned as is.

    :param node: the volume XML node
    :param kwargs: parsing options
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html
def parse_node_with_xref(self, node, **kwargs):
    """
    Turn an <xref> node into an HTML anchor pointing at its first target.

    The rid attribute may hold several whitespace-separated ids. The link
    targets the first one; a leading "bib" is replaced with "r". When the
    ref-type is fig, table or textbox and the instance has a floats_to_insert
    list, every id is appended to that list.

    Nothing is returned (two empty strings) when the ignore_xref option is set
    or when rid is missing, empty or made of whitespace only.

    :param node: the xref XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # split() on a blank attribute gives an empty list: guard on the list, not
    # on the raw attribute, so that rids[0] can never raise IndexError.
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # The ref-type does not depend on the id: read it once, outside the loop
    ref_type = node.get("ref-type") or None
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html
def parse_inner_node(self, node, **kwargs):
    """
    Return the (tex, html) content of a node: its text followed by the
    output of parse_node_with_mixed_content for each child.

    Used by the parse_node_with_<tag> functions of nodes that have a
    different tag in HTML. The children are always parsed with is_top=False.
    The tail of the node itself is not added here.

    :param node: XML node
    :param kwargs: parsing options
    :return: (tex, html)
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex = html = ""
    own_text = node.text
    if own_text:
        tex = unicode_to_latex(own_text) if self.for_tex_file else own_text
        html = escape(own_text)

    for child in node:
        child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex += child_tex
        html += child_html

    return tex, html
def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Return the (TeX, HTML) text of an XML node which mixes text and sub-nodes.

    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced by their HTML
    equivalent. The function is called recursively on the children.

    :param node: XML node; None gives two empty strings
    :param kwargs: parsing options; the missing ones receive a default value
    :return: (tex, html)
    """
    if node is None:
        return "", ""

    option_defaults = (
        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # It is added only when the function was called recursively
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... while parsing nodes like <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url to image links
        ("base_url", ""),
        # footnotes are removed from the fulltext (and put at the end),
        # except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, default in option_defaults:
        kwargs.setdefault(option, default)

    tag = normalize(node.tag)

    # pub-id/object-id are treated separately: ignore them outside a <comment>
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags share the parse_node_with_<tag> function of another tag
    handler_alias = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
    }
    table_tags = (
        "table",
        "th",
        "tr",
        "td",
        "thead",
        "tbody",
        "colgroup",
        "col",
        "tgroup",
        "entry",
        "row",
    )
    handler_alias.update(dict.fromkeys(table_tags, "table-generic"))

    handler_key = handler_alias.get(tag, tag)
    handler = getattr(self, "parse_node_with_" + handler_key.replace("-", "_"), None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and not kwargs["is_comment"] and kwargs["add_ext_link"]:
            is_extid_value = self.parse_ext_link(node, **kwargs)
            if is_extid_value and kwargs["is_mixed_citation"]:
                # an extid was found in a mixed_citation: do not repeat the id text
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Add the node text (before the children text)
        own_text = node.text
        if own_text is not None:
            inner_tex += unicode_to_latex(own_text) if self.for_tex_file else own_text
            inner_html += escape(own_text)

        # II.2. children
        child_kwargs = dict(kwargs, is_top=False)

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the punctuation that is now useless
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in (",", ".")
                and inner_html[-2:] == ". "
            ):
                inner_html = inner_html[:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. wrap the children text with html links
        # (skipped when the node text starts with a newline or a space)
        if kwargs["add_HTML_link"] and node.text:
            if not re.match(r"[\n ]+", node.text):
                inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    tail = node.tail
    if tail and not kwargs["is_top"]:
        tex += unicode_to_latex(tail) if self.for_tex_file else tail
        html += escape(tail)

    return tex, html
def parse_abstract(self, node, **kwargs):
    """
    Parse an <abstract> node and append the result to self.abstracts.

    The "tag" of the entry is the abstract-type attribute ("abstract" when the
    attribute is missing or equal to "author"). The language falls back to
    self.lang.

    :param node: the abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(entry)
def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an <aff-alternatives> node and attach its address to contributors.

    The address is the text of the <aff> child, without its <label> if there
    is one. With a label, the address goes only to the contributors whose
    "xrefs" contain the id of the node; without a label, it goes to every
    contributor. An address already present on a contributor is not added twice.

    :param node: the aff-alternatives XML node
    :param kwargs: unused
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    aff_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for part in child:
            if part.tag == "label" and address == "":
                label = get_text_from_node(part)
                address = get_text_from_node(child)[len(label) :]
                aff_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        if ("xrefs" in contrib and xref_id in contrib["xrefs"]) or aff_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)
def parse_award_group(self, node, **kwargs):
    """
    Parse an <award-group> node and append {"abbrev", "award_id"} to self.awards.

    The abbrev is the text of <funding-source>, the award_id is the text of
    <award-id>. Nothing is appended unless both were found. Any other child
    is reported in self.warnings.

    :param node: the award-group XML node
    :param kwargs: unused
    """
    funder = None
    award = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award = child.text
        elif tag == "funding-source":
            funder = get_text_from_node(child)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if funder is None or award is None:
        return
    self.awards.append({"abbrev": funder, "award_id": award})
def parse_contrib_group(self, node, **kwargs):
    """
    Parse a <contrib-group> node.

    - <contrib> children are appended to self.contributors. Their role is the
      content-type of the group without its final "s", followed by "|" and the
      role of the contrib when it has one.
    - <aff-alternatives> children are handled by parse_aff_alternatives.
    - <fn> children are added to self.footnotes_xml / self.footnotes_html.
    - Any other child is reported in self.warnings.

    :param node: the contrib-group XML node
    :param kwargs: unused
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
def parse_counts(self, node, **kwargs):
    """
    Parse a <counts> node and append (tag, value) tuples to self.counts.

    The value of a child is its count attribute or, if missing, its text.
    Children without a value are skipped. "book-page-count" is stored as
    "page-count".

    :param node: the counts XML node
    :param kwargs: unused
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        tag = normalize(child.tag)
        self.counts.append(("page-count" if tag == "book-page-count" else tag, value))
def parse_ext_link(self, node, **kwargs):
    """
    Parse an <ext-link> node: register its external id or keep it as a link.

    The data of the link is first given to add_extids_from_node_with_link.
    If no extid was recognised and the add_ext_link option is set, the data is
    appended to self.ext_links, unless it is already there or is a "cover".

    :param node: the ext-link XML node
    :param kwargs: parsing options; only add_ext_link is read
    :return: True if the link is an external id, False otherwise
    """
    link = self.get_data_from_ext_link(node)
    extid = self.add_extids_from_node_with_link(link)
    is_extid = extid[0] is not None

    if not is_extid and kwargs.get("add_ext_link", False):
        if link not in self.ext_links and link["rel"] != "cover":
            self.ext_links.append(link)

    return is_extid
def parse_front_matter(self, node, **kwargs):
    """
    Parse a <front-matter> node.

    Stores the XML of the node in self.frontmatter_xml, resets
    self.frontmatter_foreword_html, then stores the HTML of the <foreword>
    and <toc> children in self.frontmatter_foreword_html and
    self.frontmatter_toc_html.

    :param node: the front-matter XML node
    :param kwargs: unused
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    attribute_by_tag = {
        "foreword": "frontmatter_foreword_html",
        "toc": "frontmatter_toc_html",
    }
    for child in node:
        attribute = attribute_by_tag.get(normalize(child.tag))
        if attribute is None:
            continue
        _, child_html = self.parse_node_with_mixed_content(child)
        setattr(self, attribute, child_html)
def parse_id(self, node, **kwargs):
    """
    Parse an id node (article-id, book-id, book-part-id...).

    The type of the id is read from the first attribute found among
    pub-id-type, book-id-type and book-part-id-type:
    - pii: stored in self.pii, only if self.pid starts with "CR" and has
      more than 2 characters
    - numdam-id / mathdoc-id: stored in self.pid
    - ark: appended to self.extids
    - doi / eid: appended to self.ids; a doi is also stored in self.doi

    :param node: the id XML node
    :param kwargs: unused
    """
    node_id = node.text

    node_type = ""
    for attribute in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attribute in node.attrib:
            node_type = node.attrib[attribute]
            break

    if node_type == "pii":
        # Elsevier ids get a special treatment: web scrapping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid[0:2] == "CR":
            self.pii = node_id
    elif node_type in ("numdam-id", "mathdoc-id"):
        self.pid = node_id
    elif node_type == "ark":
        self.extids.append((node_type, node_id))
    elif node_type in ("doi", "eid"):
        self.ids.append((node_type, node_id))
        if node_type == "doi":
            self.doi = node_id
def parse_kwd_group(self, node, **kwargs):
    """
    Parse a <kwd-group> node and extend self.kwds with its keywords.

    Keywords come from the text of the <kwd> children, or from an
    <unstructured-kwd-group> child whose TeX value is split with split_kwds
    (this replaces the keywords collected so far). Any other child is reported
    in self.warnings.

    Every keyword is stored as {"type", "lang", "value"}. The type is the
    content-type attribute of the group or, if empty, its kwd-group-type.
    The language falls back to self.lang.

    :param node: the kwd-group XML node
    :param kwargs: unused
    """
    kwds = []
    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # NOTE(review): the code used to read "content-node_type", which looks like
    # the leftover of a type -> node_type rename; "content-type" is presumably
    # the attribute that was meant - confirm. The old name is still read, as a
    # second choice, so that nothing that worked before stops working.
    content_type = (
        node.get("content-type")
        or node.get("content-node_type")
        or node.get("kwd-group-type")
        or ""
    )
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend([{"type": content_type, "lang": lang, "value": kwd} for kwd in kwds])
def parse_ref_list(self, node, **kwargs):
    """
    Parse a <ref-list> node.

    Each <ref> child becomes a JatsRef: its warnings are merged into
    self.warnings, the object is appended to self.bibitems and its
    citation_html to self.bibitem. <p> children are parsed for their side
    effects only. Any other child is reported in self.warnings.

    :param node: the ref-list XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            bibitem = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(bibitem.warnings)
            self.bibitems.append(bibitem)
            self.bibitem.append(bibitem.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: origin + " " + tag})
def parse_related_article(self, node, **kwargs):
    """
    Parse a <related-article> node and append a relation to self.relations.

    The relation carries the related-article-type attribute (rel_type) and the
    text of the node (id_value). When the instance has a pii attribute and the
    text is not "NONE" and does not contain "10.", the text is taken as a pii
    and scrapping.fetch_article is called to obtain the DOI.

    :param node: the related-article XML node
    :param kwargs: unused
    """
    rel_type = get_normalized_attrib(node, "related-article-type") or ""
    id_value = node.text

    is_pii = (
        hasattr(self, "pii") and id_value and "10." not in id_value and id_value != "NONE"
    )
    if is_pii:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = rel_type
    relation.id_value = id_value

    self.relations.append(relation)
def parse_related_object(self, node, **kwargs):
    """
    Parse a <related-object> node.

    Without a document-id-type attribute, the node is stored as a dict in
    self.related_objects. With one, its document-id attribute (unless it is
    "NONE") gives a "refers to" relation appended to self.relations; an id
    that does not contain "10." is taken as a pii and
    scrapping.fetch_article is called to obtain the DOI.

    :param node: the related-object XML node
    :param kwargs: unused
    """
    data = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(data)
        return

    id_value = node.get("document-id") or ""
    if id_value == "NONE":
        return

    if id_value and "10." not in id_value:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = id_value

    self.relations.append(relation)
def parse_sec(self, node, **kwargs):
    """
    Parse a <sec> node: only its <ref-list> children are used.

    <title> children are ignored; any other child is reported in self.warnings.

    :param node: the sec XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
            continue

        origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: origin + " " + tag})
def parse_self_uri(self, node, **kwargs):
    """
    Parse a <self-uri> node and append a "full-text" stream to self.streams.

    The mimetype is the content-type attribute ("text/html" by default).
    Nodes with the "application/xml" mimetype are not stored.

    :param node: the self-uri XML node
    :param kwargs: unused
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location like the
    # other Mathdoc collections: the collection folder is missing, add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(
        {
            "rel": "full-text",
            "mimetype": mimetype,
            "location": location,
            "base": base,
            "text": "" if node.text is None else normalize_space(node.text),
        }
    )
def parse_sub_article(self, node, **kwargs):
    """
    Parse a <sub-article> node (used for translations).

    The node is parsed as a JatsArticle, appended to self.translations.

    :param node: the sub-article XML node
    :param kwargs: unused
    """
    self.translations.append(JatsArticle(tree=node))
def parse_subj_group(self, node, **kwargs):
    """
    Parse a <subj-group> node and append its subjects to self.subjs.

    Each <subject> child gives {"type", "lang", "value"}: the type is the
    subj-group-type attribute of the group and the language falls back to
    self.lang. Any other child is reported in self.warnings.

    :param node: the subj-group XML node
    :param kwargs: unused
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)
def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a <supplementary-material> node and append it to
    self.supplementary_materials.

    The location is the href attribute or, if empty, the id attribute. The
    mimetype is the mimetype attribute or, if empty, the value returned by
    resolver.get_mimetype for the location. A material is not added when
    another one with the same file name (basename of the location) is
    already in the list.

    :param node: the supplementary-material XML node
    :param kwargs: unused
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption or "",
    }

    file_name = os.path.basename(location)
    same_file = [
        known
        for known in self.supplementary_materials
        if os.path.basename(known["location"]) == file_name
    ]
    if not same_file:
        self.supplementary_materials.append(material)
def parse_title(self, node, **kwargs):
    """
    Parse a title node and set self.title_tex and self.title_html.

    The xrefs of the title are ignored (ignore_xref option).

    :param node: the title XML node
    :param kwargs: unused
    """
    # In xmldata.py, title_xml had the <title_group> tag:
    # self.title_xml can't be set in parse_title
    title_tex, title_html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = title_tex
    self.title_html = title_html
def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group> node.

    - title nodes are handled by parse_title
    - <subtitle> is appended, after a space, to self.title_tex / self.title_html
    - <trans-title-group> is handled by parse_trans_title_group
    - the HTML of <abbrev-title> is stored in self.abbrev
    - the <fn> of a <fn-group> are added to self.footnotes_xml / footnotes_html
    - any other child is reported in self.warnings

    self.title_xml is the XML of the node, without its <fn-group> children.

    :param node: the title-group XML node
    :param kwargs: unused
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    found_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            sub_tex, sub_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + sub_tex
            self.title_html += " " + sub_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            found_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True, keep_fn_label=False)
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if not found_fn_group:
        self.title_xml = get_xml_from_node(node)
        return

    # fn-group is now a funding statement and will be exported separately in the XML:
    # => remove it from the title-group
    stripped_group = etree.Element("title-group")
    for child in node:
        if normalize(child.tag) != "fn-group":
            stripped_group.append(copy.deepcopy(child))
    self.title_xml = get_xml_from_node(stripped_group)
def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a <trans-abstract> node and append the result to self.abstracts.

    Same as parse_abstract, except that the language falls back to "und"
    and not to self.lang.

    :param node: the trans-abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(entry)
def parse_trans_title(self, node, **kwargs):
    """
    Parse a <trans-title> node and set self.trans_title_tex,
    self.trans_title_html and self.trans_title_xml.

    :param node: the trans-title XML node
    :param kwargs: unused
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = title_tex
    self.trans_title_html = title_html
    self.trans_title_xml = get_xml_from_node(node)
def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a <trans-title-group> node.

    <trans-title> children are handled by parse_trans_title; any other child
    is reported in self.warnings. self.trans_lang is set to the lang attribute
    of the group ("und" if missing).

    :param node: the trans-title-group XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"
def get_data_from_contrib(self, node):
    """
    Return the contributor dict described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or
    <name-alternatives>. In a <mixed-citation>, each <name> creates 1 person:
    we can't use the same code.

    :param node: the contrib XML node
    :return: the dict created by create_contributor, filled with the data of
             the node
    """
    params = create_contributor()

    for child in node:
        tag = child.tag

        if tag in ("name", "string-name"):
            self.update_data_from_name(child, params)
            # A string-name without structured names: keep its text
            if (
                tag == "string-name"
                and params["first_name"] == ""
                and params["last_name"] == ""
            ):
                params["string_name"] = child.text or ""
        elif tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(child)
        elif tag == "contrib-id":
            id_type = child.get("contrib-id-type") or ""
            if id_type in ("orcid", "idref"):
                params[id_type] = child.text or ""
        elif tag == "address":
            params["addresses"].append(get_text_from_node(child))
        elif tag == "email":
            params["email"] = child.text or ""
        elif tag == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (child.get("ref-type") or "") == "aff":
                xref = child.get("rid") or get_text_from_node(child)
                if xref != "":
                    params.setdefault("xrefs", []).append(xref)
        elif tag == "collab":
            params["string_name"] = child.text or ""
        elif tag == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to params['role'] as we want
            # a controlled vocabulary (author / editor / organizer...)
            # Ignore the value
            pass
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})

    # The addresses are not sorted: a sort causes differences between the HTML
    # and the PDF (discovered in PCJ).
    # The sort was introduced on 22/09/2020, based on differences between
    # the Cedrics->JATS XSLT and the Cedrics import

    helper_update_name_params(params)

    if node.get("corresp") == "yes":
        params["corresponding"] = True

    params["deceased_before_publication"] = node.get("deceased") == "yes"
    params["equal_contrib"] = node.get("equal-contrib") == "yes"

    return params
def get_data_from_custom_meta(self, node):
    """
    Return the (name, value) pair stored in a <custom-meta> node.

    The name is the text of <meta-name>, the value is the text of
    <meta-value>; both are "" when the child is missing. Any other child is
    reported in self.warnings.

    :param node: the custom-meta XML node
    :return: (name, value)
    """
    found = {"meta-name": "", "meta-value": ""}

    for child in node:
        tag = normalize(child.tag)

        if tag in found:
            found[tag] = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    return found["meta-name"], found["meta-value"]
def get_data_from_date(self, node, ignore_month=False):
    """
    Return the date of a date node as a string.

    The iso-8601-date attribute is returned as is when the node has one.
    Otherwise the string is built from the <year>, <month> and <day> children:
    "YYYY", "YYYY-MM" or "YYYY-MM-DD". The result is "" without a year.

    The day is used only when a month is: a "YYYY-DD" string would not be a
    date. Children other than year, month and day are reported in
    self.warnings.

    :param node: the date XML node
    :param ignore_month: if True, the <month> child is skipped (and so is the
                         day); no warning is recorded for it
    :return: the date string
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for child in node:
        tag = normalize(child.tag)

        # "or ''": an empty element has a None text; always return a string
        if tag == "year":
            year = child.text or ""
        elif tag == "month":
            # Tested apart from ignore_month so that a month skipped on purpose
            # does not end up in the warnings as an unknown tag
            if not ignore_month:
                month = child.text or ""
        elif tag == "day":
            day = child.text or ""
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    date_str = year
    if date_str and month:
        date_str += "-" + month
        if day:
            date_str += "-" + day

    return date_str
def get_data_from_ext_link(self, node, **kwargs):
    """
    Return the data of an <ext-link> node as a dict.

    The "metadata" value is the HTML of the content of the node, parsed with
    add_HTML_link forced to False.

    :param node: the ext-link XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: dict with the rel, mimetype, location, base and metadata keys
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    kwargs["add_HTML_link"] = False
    _, inner_html = self.parse_inner_node(node, **kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": inner_html,
    }
def get_data_from_history(self, node):
    """
    Return the dates of a <history> node as a list of {"type": ..., "date": ...} dicts.

    Only children carrying a "date-type" attribute are kept; the date string comes
    from get_data_from_date. Other children are reported in self.warnings.
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    dates = []

    for child in node:
        date_type = child.attrib.get("date-type")

        if date_type is None:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
            continue

        dates.append({"type": date_type, "date": self.get_data_from_date(child)})

    return dates
def update_data_from_name(self, node, contributor):
    """
    Copy the parts of a <name> node into the contributor dict (modified in place).

    <given-names>, <surname>, <prefix> and <suffix> fill "first_name", "last_name",
    "prefix" and "suffix". Children without text are skipped; children with text
    and any other tag are reported in self.warnings.
    """
    key_for_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for child in node:
        if child.text is None:
            continue

        key = key_for_tag.get(child.tag)
        if key is not None:
            contributor[key] = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
def get_data_from_name_alternatives(self, node):
    """
    Return the text of the <string-name specific-use="index"> child, or "" if there is none.

    Children without text are skipped. Children with text that are not
    <string-name> are reported in self.warnings.
    """
    mid = ""

    for child in node:
        if child.text is None:
            continue

        if child.tag != "string-name":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
        elif child.get("specific-use") == "index":
            mid = child.text

    return mid
def get_data_from_uri(self, node, **kwargs):
    """
    Return a link dict for a <uri> node, with the same keys as get_data_from_ext_link.

    "rel" is always None and "base" always ""; "metadata" is the second value
    returned by parse_inner_node, called with add_HTML_link forced to False.
    """
    location = get_normalized_attrib(node, "href") or ""

    kwargs["add_HTML_link"] = False
    inner = self.parse_inner_node(node, **kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": inner[1],
    }
def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text that represents a link node.

    The link data is obtained from the get_data_from_<tag> method matching the node tag.
    When the link has no "rel" (or rel == "uri"), the result is a TeX \\href command
    (if self.for_tex_file) or the output of make_links_clickable.
    Otherwise the node text (or "") is returned.
    """
    fallback_text = node.text or ""

    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    link = getattr(self, getter_name)(node, **kwargs)

    if link["rel"] and link["rel"] != "uri":
        return fallback_text

    location = link["location"]
    if self.for_tex_file:
        return "\\href{" + location + "}{" + link["metadata"] + "}"
    return make_links_clickable(location, link["metadata"])
def get_list_start_value(self, list_node):
    """
    Return the number of items that precede list_node in a chain of continued lists.

    A list may reference, through its "continued-from" attribute, the id of the list it
    continues. The start value is the total number of children of all the lists found
    by following that chain in self.tree.

    Returns 0 when the list does not continue another one. The chain stops silently
    when an id cannot be found in the tree (this used to raise UnboundLocalError)
    or when it loops back on a list already visited (this used to recurse forever).
    """
    start = 0
    # ids already seen, to protect against circular continued-from references
    visited = {list_node.get("id")}
    current = list_node

    while True:
        continued_from = current.get("continued-from")
        if continued_from is None or continued_from in visited:
            break
        visited.add(continued_from)

        current = self.tree.find(f'.//*[@id="{continued_from}"]')
        if current is None:
            # Dangling reference: count what has been found so far
            break

        start += len(current)

    return start
class MathdocPublication(MathdocPublicationData, JatsBase):
    """
    Publication (collection) parsed from an XML tree.

    The tree is passed as the "tree" keyword argument and is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Dispatch the direct children of the tree: ids, titles, ISSNs, ext-links,
        custom-meta and description. Unknown tags are reported in self.warnings.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # An id without type is considered to be the pid
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "serial-type" sets self.coltype,
        "wall" sets self.wall (as an int) and "provider" sets self.provider.
        Children that are not <custom-meta> are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Store a <description> node as an abstract with the "description" tag.

        value_xml is the serialized node; value_html and value_tex are the same string
        without the enclosing <description> tags.
        """
        # tag = get_normalized_attrib(node, "abstract-node_type") or "abstract"
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the complete opening tag (with its attributes) and the closing tag.
        # The previous code replaced the misspelled "<decription", so the opening tag
        # was never removed.
        inner = re.sub(r"<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )
class JatsPublisher(PublisherData):
    """
    Publisher parsed from a <publisher> node.

    The node is passed as the "tree" keyword argument and is parsed in the constructor.
    Unexpected child tags are collected in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The list must exist before parsing. It is no longer reset after parse_tree:
        # the reset discarded every warning that had just been collected.
        self.warnings = []
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read <publisher-name> and <publisher-loc>; report any other tag."""
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                # NOTE(review): this class never sets a pid and PublisherData is not
                # visible here, so fall back to None instead of raising AttributeError.
                self.warnings.append(
                    {
                        getattr(self, "pid", None): self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )
class JatsJournal(JournalData, JatsBase):
    """
    Journal parsed from a <journal-meta> like node.

    The node is passed as the "tree" keyword argument and is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Dispatch the direct children of the tree: journal-id, journal-title-group,
        publisher and issn. Unknown tags are reported in self.warnings.
        """
        super().parse_tree(tree)

        # pub-type -> (attribute to set, id type appended to self.ids)
        issn_targets = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                # A journal-id without type is handled as a numdam-id
                id_type = node.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                # An issn without pub-type is handled as a print issn
                target = issn_targets.get(node.get("pub-type") or "ppub")
                if target is not None:
                    attr_name, id_type = target
                    setattr(self, attr_name, node.text)
                    self.ids.append((id_type, node.text))
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
class JatsIssue(IssueData, JatsBase):
    """
    Issue parsed from an XML tree with <journal-meta>, <issue-meta> and <body> children.

    Keyword arguments read by the constructor:
        tree: the XML node to parse (required)
        from_folder: used to change the location of Elsevier graphics to a full path location
        no_bib: forwarded to the JatsArticle objects created for the <article> nodes
    """

    # Article types deduced from the beginning of a lower-cased "heading" subject.
    # The pairs are tested in order; the first pair with a matching prefix wins.
    _HEADING_ARTICLE_TYPES = (
        (("erratum",), "erratum"),
        (("corrigendum",), "corrigendum"),
        (("foreword",), "foreword"),
        (("nécrologie", "obituary"), "history-of-sciences"),
        (("block calendar/éphéméride", "chronique"), "history-of-sciences"),
        (("histoire", "historic"), "history-of-sciences"),
        (("tribute/hommage",), "history-of-sciences"),
        (("note historique",), "historical-commentary"),
        (("le point sur", "le point-sur"), "review"),
        (("review", "revue", "concise review"), "review"),
        (("conférence",), "congress"),
        (("communication", "preliminary"), "preliminary-communication"),
        (("debate",), "opinion"),
        (("index", "keyword", "sommaire"), "editorial"),
        (("table auteurs", "table sommaire"), "editorial"),
        (("page présentation des index",), "editorial"),
        # Article de crbiol, Pubmed stores them as "Classical Article"
        (("fac-similé",), "historical-commentary"),
    )

    # Article types deduced from the exact lower-cased value of a "type" subject
    # ("editorial" and values starting with "conference" are handled separately)
    _SUBJ_ARTICLE_TYPES = {
        "discussion": "letter",
        "mini review": "review",
        "review article": "review",
        "book review": "review",
        "research article": "research-article",
        "short communication": "foreword",
        "correspondence": "letter",
    }

    # Headings that are redundant with article types: they are not kept as subjects
    _REDUNDANT_HEADINGS = (
        "editorial",
        "éditorial",
        "avant-propos",
        "book review",
        "comment",
        "concise review paper",
        "answer",
        "commentaire",
        "commentary",
        "reply",
        "foreword",
        "full paper",
        "mémoire",
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the issue, then post-process its articles:
        remove the editors replicated from the issue, fix the location of the icons
        (Elsevier only) and fix the article types and subjects.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    child_tag = normalize(child.tag)

                    if child_tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + child_tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        self._remove_replicated_editors()

        # Only Elsevier articles have a pii
        if any(hasattr(xarticle, "pii") for xarticle in self.articles):
            self._fix_icon_locations()

        for xarticle in self.articles:
            self._fix_article_type_and_subjects(xarticle)

    def _remove_replicated_editors(self):
        """Remove the editors of the articles that have exactly the editors of the issue."""
        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        for xarticle in self.articles:
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]

            is_equal = len(editors) == len(issue_editors) and all(
                editor["last_name"] == issue_editor["last_name"]
                and editor["first_name"] == issue_editor["first_name"]
                for editor, issue_editor in zip(editors, issue_editors)
            )
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

    def _fix_icon_locations(self):
        """Prefix the location of the icon ext_links with the journal/issue (and source) folders."""
        for link in self.ext_links:
            if link["rel"] in ["icon", "small_icon"]:
                base_dir = self.journal.pid
                location = link["location"]
                if os.path.dirname(location) != base_dir:
                    location = os.path.join(base_dir, self.pid, location)
                if self.from_folder:
                    location = os.path.join(self.from_folder, location)
                    location = "file:" + location
                link["location"] = location

    def _strip_dossier_prefix(self):
        """Remove a leading "Dossier: " (any case) from the issue title."""
        prefix = "dossier: "
        if self.title_tex.lower().startswith(prefix):
            self.title_tex = self.title_html = self.title_tex[len(prefix) :]
            # Build the XML from the stripped title itself (the lower-cased copy was used
            # before, so title_xml lost the case that title_tex and title_html kept)
            self.title_xml = (
                "<issue-title>" + get_single_title_xml(self.title_tex) + "</issue-title>"
            )

    def _article_type_from_heading(self, value, old_type):
        """
        Return the article type deduced from a lower-cased heading, or None.

        old_type is the lower-cased value of the last "type" subject seen for the article.
        """
        for prefixes, article_type in self._HEADING_ARTICLE_TYPES:
            if value.startswith(prefixes):
                return article_type

        if value.startswith("perspective") and old_type in [
            "correspondence",
            "short communication",
        ]:
            return "opinion"

        return None

    def _fix_article_type_and_subjects(self, xarticle):
        """
        Set xarticle.atype and filter xarticle.subjs.

        An article with a non numeric fpage is an editorial (its subjects are dropped).
        Otherwise the type is deduced from the "type" and "heading" subjects;
        the subjects that only convey the article type or the issue title are dropped.
        """
        article_type = "research-article"
        old_type = ""
        new_subjs = []

        # A missing fpage (None or "") says nothing about the article type
        if xarticle.fpage:
            try:
                int(xarticle.fpage)
            except ValueError:
                # fpage is not a number: the article is an editorial
                article_type = "editorial"

        if article_type == "research-article":
            for subj in xarticle.subjs:
                if subj["type"] == "type":
                    # Fix article types
                    value = subj["value"].lower()
                    old_type = value
                    if value == "editorial":
                        if xarticle.title_tex.lower().startswith("foreword"):
                            article_type = "foreword"
                        else:
                            article_type = "editorial"
                    elif value in self._SUBJ_ARTICLE_TYPES:
                        article_type = self._SUBJ_ARTICLE_TYPES[value]
                    elif value.startswith("conference"):
                        article_type = "congress"
                elif subj["type"] == "heading" and not xarticle.title_tex:
                    # The title may be stored in the heading: fix it
                    xarticle.title_tex = xarticle.title_html = subj["value"]
                    xarticle.title_xml = get_title_xml(subj["value"])
                elif subj["type"] == "heading":
                    value = subj["value"].lower().strip()
                    self._strip_dossier_prefix()

                    # Some heading values are in fact article type
                    heading_type = self._article_type_from_heading(value, old_type)
                    if heading_type is not None:
                        article_type = heading_type
                        if value.startswith("fac-similé"):
                            # Keep the subject to preserve the "fac-similé" (== copy) mention
                            new_subjs.append(subj)
                    # Ignore the issue titles
                    elif not self.title_tex or not value.startswith(
                        self.title_tex.lower().strip()
                    ):
                        # Exclude headings that are redundant with article types
                        if not value.startswith(self._REDUNDANT_HEADINGS):
                            new_subjs.append(subj)
                else:
                    new_subjs.append(subj)

        xarticle.atype = article_type
        xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "provider" sets self.provider and
        "efirst" sets self.with_online_first (True when the value is "yes").
        Children that are not <custom-meta> are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Parse the children of <issue-meta>.

        Tags without explicit handling are dispatched to the parse_<tag> method when it
        exists (called with add_ext_link=True), and reported in self.warnings otherwise.
        last_modified_iso_8601_date_str is set to the current time if it is still None
        after parsing.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type not in ("subtitle", "cover-date"):
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()
class JatsArticleBase(JatsBase):
    """Base class providing the custom-meta parsing shared by article-like objects."""

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "article-number" and "talk-number" set the
        attributes of the same name, "presented" adds a contributor with the "presenter" role.
        Children that are not <custom-meta> are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "custom-meta":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            name, value = self.get_data_from_custom_meta(child)

            if name == "article-number":
                self.article_number = value
            elif name == "talk-number":
                self.talk_number = value
            elif name == "presented":
                # Only keep the name of the presenter
                string_name = value.replace("Presented by ", "")
                string_name = string_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = string_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)
2372class JatsArticle(ArticleData, JatsArticleBase):
def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Create the article and parse kwargs["tree"].

    Optional keyword arguments (with their default value): pid (None), issue (None),
    add_span_around_tex_formula (False), for_tex_file (False), from_folder (None),
    no_bib (False).
    """
    super().__init__(*args, **kwargs)

    option_defaults = (
        ("pid", None),
        ("issue", None),
        ("add_span_around_tex_formula", False),
        ("for_tex_file", False),
        ("from_folder", None),
        ("no_bib", False),
    )
    for option, default in option_defaults:
        setattr(self, option, kwargs.get(option, default))

    self.parse_tree(kwargs["tree"])
def parse_tree(self, tree):
    """
    Parse an <article> tree.

    The tree is read in 2 passes: <front>, <front-stub> and <floats-group> first
    (the floats are inserted inside the body), then <back>, <body> and <sub-article>.
    Unknown tags are reported in self.warnings.

    After parsing: the footnotes are appended to body_html, the funding statement is
    wrapped in a <name-content> element, the XML of the floats-group is appended to
    body_xml, and a NUMDAM source link is added when self.no_bib is set.
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                tag = normalize(child.tag)

                if tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)
        if tag == "back":
            for child in node:
                tag = normalize(child.tag)

                # (a leftover debug print was removed here: a parser should not write to stdout)
                if tag == "ref-list" and not self.no_bib:
                    self.parse_ref_list(child)
                elif tag == "ack":
                    self.parse_ack(child)
                elif tag == "sec":
                    self.parse_sec(child)
                elif tag == "app-group":
                    self.parse_app_group(child)
                elif tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag == "floats-group" or tag == "front":
            # Handled above
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Add the footnotes at the end
    if len(self.fns) > 0:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"
        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    if (
        self.funding_statement_xml
        and '<name-content content-type="fn"' not in self.funding_statement_xml
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # Special treatment for Elsevier articles: web scrapping to find the date_published
    # Moved to the import management commands since Elsevier blocks IP after 1000+ requests

    if self.no_bib:
        # For Geodesic
        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = "http://www.numdam.org/item/" + self.pid
        ext_link["metadata"] = "NUMDAM"
        self.ext_links.append(ext_link)
def update_body_content(self, node, **kwargs):
    """
    Parse a node of the full text and append the result to body_tex, body_html and body_xml.

    Nothing is done for a node without children.
    With use_sec=True (keyword argument), the node is parsed with parse_node_with_sec and
    its XML is wrapped in a <sec> when it is appended to an existing body_xml.
    A node that added supplementary materials is only appended to the body if one of
    its <p> has a child that is not a <supplementary-material>.
    """
    if len(node) == 0:
        # Most journals do not display the Full text
        # the <body> is then used to store the text for the search engine and has no children
        # Let's not compute body_html in this case.
        # We want the same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    kwargs["base_url"] = base_url

    use_sec = bool(kwargs.get("use_sec"))
    nb_supplements = len(self.supplementary_materials)

    if use_sec:
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        body_tex, body_html = self.parse_node_with_sec(node, **kwargs)
    else:
        body_tex, body_html = self.parse_node_with_mixed_content(node, **kwargs)

    append_to_body = True
    if len(self.supplementary_materials) != nb_supplements:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )

    if not append_to_body:
        return

    if self.body_tex:
        self.body_tex = self.body_tex + body_tex
    else:
        self.body_tex = body_tex

    if self.body_html:
        self.body_html = self.body_html + body_html
    else:
        self.body_html = body_html

    body_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = body_xml
    elif use_sec:
        # Drop the trailing 7 characters of the current body_xml, and the first 5 / last 6
        # characters of the new XML, then wrap the new content in a <sec>
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{body_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{body_xml}</body>"
def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    An <ack content-type="COI-statement"> sets self.coi_statement with the text of the node.
    Any other <ack> is added to the body content as a section.
    """
    if node.get("content-type") == "COI-statement":
        self.coi_statement = get_text_from_node(node)
        return

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    self.update_body_content(node, use_sec=True)
def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is added to the body content.
    Other children are reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "sec":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(child)
def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is handed to parse_app.
    Other children are reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "app":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_app(child)
def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node: each <subj-group> child is handed to parse_subj_group.
    Other children are reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "subj-group":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_subj_group(child)
def parse_article_meta(self, node, **kwargs):
    """
    Parse the children of <article-meta> (or <front-stub>).

    Ids, pages, dates and history are handled explicitly. <volume>, <issue-id>,
    <permissions>, <pub-date-not-available> and <author-notes> are ignored.
    Any other tag is dispatched to the parse_<tag> method when it exists (called with
    add_ext_link=True), and reported in self.warnings otherwise.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "article-id":
            self.parse_id(child)
        elif tag == "fpage":
            # An empty <fpage/> gives "" and not None, like lpage:
            # the issue post-processing compares fpage to "" before converting it to int
            self.fpage = child.text or ""
            self.page_type = child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = child.text or ""
        elif tag == "page-range":
            self.page_range = child.text
        elif tag in ("page-count", "size"):
            self.size = child.text
        elif tag == "elocation-id":
            self.elocation = child.text
        elif tag == "pub-date":
            date_type = child.get("date-type") or "pub"
            if date_type == "pub":
                self.date_published_iso_8601_date_str = self.get_data_from_date(child)
            else:
                # Any other pub-date is stored in the history as the "online" date
                date_str = self.get_data_from_date(child)
                self.history_dates.append({"type": "online", "date": date_str})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(child)
            for date in self.history_dates:
                if date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = date["date"]
        elif tag in ["volume", "issue-id", "permissions", "pub-date-not-available"]:
            pass
            # TODO: store permissions in XML
        elif tag == "author-notes":
            # 2022/11/15 Mersenne meeting. ignore author-notes
            pass
        else:
            fct_name = "parse_" + tag.replace("-", "_")
            ftor = getattr(self, fct_name, None)
            if callable(ftor):
                ftor(child, add_ext_link=True)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )
def parse_author_notes(self, node, **kwargs):
    """
    Append the XML and HTML of each <fn> child to footnotes_xml and footnotes_html.
    The footnotes are parsed with keep_fn=True and keep_fn_label=False.
    Other children are silently ignored.
    """
    for child in node:
        if normalize(child.tag) != "fn":
            continue

        parsed = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
        self.footnotes_xml += get_xml_from_node(child)
        self.footnotes_html += parsed[1]
def parse_body(self, node, **kwargs):
    """
    Parse the <body> node.

    self.body receives the text of the node. The TeX/HTML/XML versions are computed by
    update_body_content; body_xml falls back to the XML of the node if it is still empty.
    """
    self.body = get_text_from_node(node)

    # self.floats only exists when a <floats-group> has been parsed
    has_floats = hasattr(self, "floats")
    if has_floats:
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    if self.body_xml:
        return
    self.body_xml = get_xml_from_node(node)
def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> (called by parse_floats_group for the children of <floats-group>).

    The HTML returned by parse_node_with_boxed_text is stored in the self.floats dictionary,
    using the "id" attribute of the node as key.
    A node without id is still parsed, but nothing is stored.
    """
    # Read the id before parsing, as the original implementation did
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html
def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    self.floats is reset to an empty dictionary, then each child is dispatched:
    - <fig> to parse_node_with_fig (append_floats=True)
    - <table-wrap> to parse_node_with_table_wrap (append_floats=True)
    - <boxed-text> to parse_boxed_text
    Any other tag is recorded in self.warnings.
    The XML of the whole node is finally stored in self.floats_group_xml.
    """
    # Base URL handed to the sub-parsers
    if not hasattr(settings, "SITE_URL_PREFIX"):
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    else:
        site_prefix = settings.SITE_URL_PREFIX
        article_base = settings.ARTICLE_BASE_URL
        base_url = "/" + site_prefix + article_base + self.pid

    self.floats = {}
    for float_node in node:
        tag = normalize(float_node.tag)

        if tag == "boxed-text":
            self.parse_boxed_text(float_node, base_url=base_url)
        elif tag == "table-wrap":
            self.parse_node_with_table_wrap(float_node, append_floats=True, base_url=base_url)
        elif tag == "fig":
            self.parse_node_with_fig(float_node, append_floats=True, base_url=base_url)
        else:
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)
def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node.

    The HTML and XML of every <fn> child are appended to self.footnotes_html and
    self.footnotes_xml. Any other tag is recorded in self.warnings.
    """
    for fn_node in node:
        tag = normalize(fn_node.tag)

        if tag != "fn":
            origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True)
        fn_xml = get_xml_from_node(fn_node)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml
# parse_funding_group: parse the children of a <funding-group> node.
# - <award-group> children are delegated to self.parse_award_group.
# - <funding-statement> children are scanned for "name-content" nodes; the <fn> elements found
#   there are rendered with parse_node_with_fn (keep_fn=True) and their HTML is appended to
#   self.funding_statement_html. self.funding_statement_xml is set from a "name-content" node.
# - Any other tag is recorded in self.warnings.
# NOTE(review): the nested nodes are compared with the raw .tag (no normalize() call) -
# presumably they are never namespaced; confirm.
# NOTE(review): the indentation is lost in this listing, so the exact nesting level of the
# funding_statement_xml assignment cannot be confirmed from here.
2752 def parse_funding_group(self, node, **kwargs):
2753 for child in node:
2754 tag = normalize(child.tag)
2756 if tag == "award-group": 2756 ↛ 2758line 2756 didn't jump to line 2758, because the condition on line 2756 was never false
2757 self.parse_award_group(child)
2758 elif tag == "funding-statement":
2759 for funding_node in child:
2760 if funding_node.tag == "name-content":
2761 for funding_child in funding_node:
2762 if funding_child.tag == "fn":
2763 _, html = self.parse_node_with_fn(funding_child, keep_fn=True)
2764 self.funding_statement_html += html
2765 self.funding_statement_xml = get_xml_from_node(funding_node)
2767 # TODO: handle funding-statement with simple texts
2768 else:
2769 self.warnings.append(
2770 {
2771 self.pid: self.__class__.__name__
2772 + "."
2773 + inspect.currentframe().f_code.co_name
2774 + " "
2775 + tag
2776 }
2777 )
def parse_issue(self, node, **kwargs):
    """
    Set self.seq (as a string) from the "seq" attribute of an <issue> node.

    Objects that have a "pii" attribute always get "0"; a missing or empty attribute
    also gives "0".
    """
    # Elsevier stores unusable values in the seq attribute
    # (presumably the objects with a pii are the Elsevier ones - to be confirmed)
    if hasattr(self, "pii"):
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"
class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference built from a JATS <ref> tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    The parse_* functions fill the instance variables (label, citation_xml, citation_html,
    citation_tex, contributors, comment, ...).
    """

    def __init__(self, *args, **kwargs):  # , tree, lang):
        super().__init__(*args, **kwargs)  # lang)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the <ref> node.

        - <label>: stored in self.label, surrounded by square brackets if needed
        - <mixed-citation> / <note>: data are extracted with parse_citation_node, the tex/html
          come from parse_node_with_mixed_content and are prefixed with the label
        - <element-citation>: data are extracted with parse_citation_node, the tex/html
          come from get_citation_html
        Other tags are recorded in self.warnings.
        The XML of every child is appended to self.citation_xml.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Build and return the contributor dict of a name-like node found in a citation.

        :param node: <name>, <string-name>, <name-alternatives>, <collab> or <etal> node
        :param role: role stored in the returned dict
        :return: dict created by create_contributor and filled with the data of the node
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A string-name without sub-elements: keep its text
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.
        Inside a mixed-citation, the HTML is post-processed by
        add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.
        Inside a mixed-citation, the HTML is post-processed by
        add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the data of a citation node (mixed-citation, element-citation or note)
        and set the corresponding instance variables.

        Once a <comment> has been met, a tag whose variable is already set is appended to
        self.comment instead of overwriting the variable.
        Tags without dedicated code are stored with setattr in the variable named after the
        tag ('-' replaced by '_'), only if they have a text and the variable is not already set.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # An empty <isbn/> or <issn/> has no text (None): ignore it instead of
                # raising a TypeError on the string concatenation
                if child.text is not None:
                    if self.annotation:
                        self.annotation += ", "
                    self.annotation += tag.upper() + ": " + child.text
            elif child.text is not None:
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: each name-like child becomes a contributor whose role is
        the person-group-type attribute without its trailing "s", if any.
        Other tags are recorded in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role.endswith("s"):
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

    def parse_pub_id(self, node, **kwargs):
        """
        Pass a <pub-id> to add_extids_from_node_with_link: the pub-id-type attribute is the
        "rel" and the text of the node is the "metadata".
        """
        node_type = node.get("pub-id-type") or ""

        data = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Set self.label_prefix and self.label_suffix with the lowercase text found
        before/after the digits of the label (its first and last characters are removed).
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            # (re.split does not return exactly 2 items)
            self.label_prefix, self.label_suffix = [label, ""]
class BitsCollection(CollectionData, JatsBase):
    """
    Collection data of a book, built from a BITS <collection-meta> or <in-collection> tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or an <in-collection> node and its children.
        self.seq is computed by set_seq when a <collection-meta> was found;
        otherwise a warning is recorded.
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None
            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        origin = (
                            f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                        )
                        self.warnings.append({self.pid: origin + " " + tag})

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                # tag is the last tag read above
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse a <collection-meta> node: collection type, pid, title, ISSNs, ext-links and
        volume data. Other tags are recorded in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

    def parse_volume(self, node, **kwargs):
        """Store the text of the node in self.volume (None for an empty node)."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse a <volume-in-collection> node: volume number, series and title.
        Other tags are recorded in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

    def parse_volume_series(self, node, **kwargs):
        """Store the text of the node in self.vseries (None for an empty node)."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Set title_tex, title_html and title_xml from a <volume-title> node."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq (int), used to order the volumes of a collection.

        :param node: the <collection-meta> node; its "seq" attribute is used if it is an integer
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

            # Third, use self.vseries as an offset
            # NOTE(review): the indentation was lost in the listing this code was rebuilt from.
            # The offset is assumed to apply only when there is no valid seq attribute - confirm.
            try:
                # no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except (TypeError, ValueError):
                # vseries is None for an empty <volume-series/> (int(None) raises TypeError)
                # or is not a number: no offset
                pass
class BitsBook(BookData, JatsBase):
    """
    Book built from a BITS <book> tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    The optional "no_bib" keyword argument is stored in self.no_bib.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the <book> node, then fix the contributors and the title
        (see set_contribs and set_title).
        """
        super().parse_tree(tree)

        self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

        for node in tree:
            # Only handle the children that have the same type as the root
            # (presumably to skip comments and processing instructions - confirm)
            if type(node) is not type(tree):
                continue

            tag = normalize(node.tag)

            if tag in ("collection-meta", "in-collection"):
                self.incollection.append(BitsCollection(tree=node))
            elif tag == "book-meta":
                self.parse_book_meta(node)
            elif tag == "book-body":
                self.parse_book_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "book-back":
                for back_node in node:
                    back_tag = normalize(back_node.tag)
                    if back_tag == "ref-list":
                        self.parse_ref_list(back_node)
                    else:
                        origin = (
                            f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                        )
                        self.warnings.append({self.pid: origin + " " + back_tag})
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

        self.set_contribs()
        self.set_title()

    def parse_book_body(self, node, **kwargs):
        """
        Parse a <book-body>: each <book-part> becomes a BitsBookPart appended to self.parts
        (its warnings are merged in self.warnings).
        Without any part, the text of the node is stored in self.body.
        """
        for child in node:
            if type(child) is not type(node):
                continue

            tag = normalize(child.tag)

            if tag != "book-part":
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            book_part = BitsBookPart(tree=child)
            self.warnings.extend(book_part.warnings)
            self.parts.append(book_part)

        if not self.parts:
            self.body = get_text_from_node(node)

    def parse_book_meta(self, node, **kwargs):
        """
        Parse a <book-meta> node.

        Tags without dedicated code are dispatched to the parse_<tag> function, if any
        (called with add_ext_link=True); otherwise a warning is recorded.
        If no last-modified date was found, the current time is used.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-id":
                self.parse_id(child)
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child)
            elif tag == "book-volume-number":
                self.volume = self.volume_int = child.text
            elif tag == "pub-history":
                for date in self.get_data_from_history(child):
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "book-title-group":
                self.parse_title_group(child)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=child)
            else:
                ftor = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                    self.warnings.append({self.pid: origin + " " + tag})

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the <custom-meta> children; only the "provider" value is kept (self.provider)."""
        for child in node:
            if normalize(child.tag) != "custom-meta":
                continue

            name, value = self.get_data_from_custom_meta(child)
            if name == "provider":
                self.provider = value

    def set_contribs(self):
        """
        Update the contributors if the XML does not declare any author
        - a monograph takes the contributors of its first part
        - an edited book or lecture notes take the contributors of the parts if all parts
          have the same ones (an edited book then becomes a monograph);
          otherwise a "Collectif" author is added

        :return:
        """
        authors = [contrib for contrib in self.contributors if contrib["role"] == "author"]
        if authors:
            return

        if self.ctype == "book-monograph" and self.parts:
            self.contributors = self.parts[0].contributors
        elif self.ctype in ("book-edited-book", "book-lecture-notes") and self.parts:
            # check if authors of the book-parts are identical
            reference = self.parts[0].contributors
            differ = any(part.contributors != reference for part in self.parts[1:])

            if differ:
                contrib = create_contributor()
                contrib["string_name"] = "Collectif"
                contrib["role"] = "author"
                contrib["contrib_xml"] = get_contrib_xml(contrib)
                self.contributors.append(contrib)
            else:
                if self.ctype == "book-edited-book":
                    self.ctype = "book-monograph"
                self.contributors = reference

    def set_title(self):
        """A book without title takes the title of its first collection, if any."""
        if self.title_xml != "" or len(self.incollection) == 0:
            return

        first_collection = self.incollection[0]
        self.title_xml = first_collection.title_xml
        self.title_html = first_collection.title_html
        self.title_tex = first_collection.title_tex
class BitsBookPart(BookPartData, JatsArticleBase):
    """
    Part of a book (chapter, ...) built from a BITS <book-part> tree.

    The tree given in the "tree" keyword argument is parsed in the constructor.
    The optional "no_bib" keyword argument is stored in self.no_bib.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the <book-part> node: attributes (book-part-type, seq), then the children.
        Unknown tags are recorded in self.warnings.
        """
        super().parse_tree(tree)

        self.atype = get_normalized_attrib(tree, "book-part-type") or ""
        try:
            self.seq = int(get_normalized_attrib(tree, "seq") or "")
        except ValueError:
            # No seq attribute or not an integer: self.seq is left untouched
            pass

        for node in tree:
            tag = normalize(node.tag)

            if tag == "book-part-meta":
                self.parse_book_part_meta(node)
            elif tag == "body":
                self.parse_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "back":
                for back_node in node:
                    tag = normalize(back_node.tag)

                    if tag == "ref-list":
                        self.parse_ref_list(back_node)
                    else:
                        origin = (
                            f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                        )
                        self.warnings.append({self.pid: origin + " " + tag})
            else:
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})

        # Workaround a numdam-plus bug where a book-part can have a trans-title without a title
        # TODO: Fix numdam-plus, the books impacted and remove the hack
        self.set_title()

    def parse_book_part_meta(self, node, **kwargs):
        """
        Parse a <book-part-meta> node: id and pages.
        Other tags are dispatched to the parse_<tag> function, if any;
        otherwise a warning is recorded.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-part-id":
                self.parse_id(child)
            elif tag == "fpage":
                self.fpage = child.text
                self.page_type = get_normalized_attrib(child, "content-type") or ""
            elif tag == "lpage":
                self.lpage = child.text
            elif tag == "page-range":
                self.page_range = child.text
            else:
                ftor = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(ftor):
                    ftor(child)
                else:
                    origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                    self.warnings.append({self.pid: origin + " " + tag})

    def parse_body(self, node, **kwargs):
        """
        Parse the <body> of a book-part: nested <book-part> nodes become BitsBookPart objects
        appended to self.parts; any other tag is recorded in self.warnings.
        The text of the node is stored in self.body.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "book-part":
                origin = f"{self.__class__.__name__}.{inspect.currentframe().f_code.co_name}"
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            sub_part = BitsBookPart(tree=child)
            self.warnings.extend(sub_part.warnings)
            self.parts.append(sub_part)

        self.body = get_text_from_node(node)

    def set_title(self):
        """
        Bug in some books: some chapters may have a trans-title, but no title !
        Hack: copy the trans_title html/tex into the title html/tex in that case
        :return:
        """
        if not self.trans_title_html or self.title_html:
            return

        self.title_html = self.trans_title_html
        self.title_tex = self.trans_title_tex
3486######################################################################################
3487#
3488# Functions used by ptf-tools
3489#
3490######################################################################################
def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference with a new set of ids.

    The existing <ext-link> / <pub-id> of the listed types are removed from the citation
    (element-citation, or mixed-citation if there is none), then one node is appended for
    each checked id that is not a false positive.

    :param bibitem: object with a citation_xml attribute (XML without the <ref> wrapper)
    :param new_ids: dict {id_type: {"checked": ..., "false_positive": ..., "id_value": ...}}
    :return: a new JatsRef built from the modified XML
    """
    xml = "<ref>" + bibitem.citation_xml + "</ref>"
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring(xml, parser=xml_parser)

    citation = ref_tree.find("element-citation")
    if citation is None:
        citation = ref_tree.find("mixed-citation")

    if citation is not None:
        # tag -> (attribute holding the id type, id types to remove)
        removable = {
            "ext-link": (
                "ext-link-type",
                ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id", "eid"),
            ),
            "pub-id": (
                "pub-id-type",
                ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id"),
            ),
        }

        # Collect first, remove afterwards: the tree is not modified while iterating
        obsolete = []
        for child in citation:
            if child.tag not in removable:
                continue
            attribute, id_types = removable[child.tag]
            if child.get(attribute) in id_types:
                obsolete.append(child)

        for child in obsolete:
            citation.remove(child)

        for id_type, value_dict in new_ids.items():
            if not value_dict["checked"] or value_dict["false_positive"]:
                continue

            if id_type in ["doi", "arxiv", "tel", "hal", "theses.fr"]:
                tag_name, attribute = "pub-id", "pub-id-type"
            else:
                tag_name, attribute = "ext-link", "ext-link-type"

            new_node = etree.Element(tag_name)
            new_node.set(attribute, id_type)
            new_node.text = value_dict["id_value"]
            citation.append(new_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=ref_tree, lang="und")
def check_bibitem_xml(bibitem):
    """
    Parse again the citation_xml of a bibitem (wrapped in <ref>) and return the
    resulting JatsRef. The language is set to "und".
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring("<ref>" + bibitem.citation_xml + "</ref>", parser=xml_parser)

    return JatsRef(tree=ref_tree, lang="und")
3559# Create XML strings based on internal data
def get_single_title_xml(title):
    """
    Escape a title while keeping its <i>, <sup> and <sub> markup.

    A markup is kept only if both its opening and closing tags are present:
    <i>...</i> becomes <italic>...</italic>, <sup> and <sub> are kept as is.
    Everything else goes through escape().
    """
    # (HTML tag, XML tag)
    markups = (("i", "italic"), ("sup", "sup"), ("sub", "sub"))
    kept = [
        (html_tag, xml_tag)
        for html_tag, xml_tag in markups
        if f"<{html_tag}>" in title and f"</{html_tag}>" in title
    ]

    # Protect the markup with placeholders that escape() does not alter
    for html_tag, _ in kept:
        title = title.replace(f"<{html_tag}>", f"|||{html_tag}|||")
        title = title.replace(f"</{html_tag}>", f"|||/{html_tag}|||")

    title = escape(title)

    # Put the markup back
    for html_tag, xml_tag in kept:
        title = title.replace(f"|||{html_tag}|||", f"<{xml_tag}>")
        title = title.replace(f"|||/{html_tag}|||", f"</{xml_tag}>")

    return title
def get_title_xml(title, trans_title=None, trans_lang=None, with_tex_values=True):
    """
    Get the <title-group> XML given a simple title (and an optional translated title).

    If the title has formulas, use CKeditorParser first, then call this function with the
    value_xml returned by the parser and set with_tex_values to False (the titles are then
    inserted as is, without going through get_single_title_xml).
    The translated title is added only if both trans_title and trans_lang are given.
    TODO: enhance CkeditorParser to accept both title and trans_title to build the xml in 1 shot.
    """
    if with_tex_values:
        title = get_single_title_xml(title)

    chunks = [
        '<title-group xmlns:xlink="http://www.w3.org/1999/xlink">',
        f'<article-title xml:space="preserve">{title}</article-title>',
    ]

    if trans_title and trans_lang:
        if with_tex_values:
            trans_title = get_single_title_xml(trans_title)
        chunks.append(
            f'<trans-title-group xml:lang="{trans_lang}"><trans-title>{trans_title}</trans-title></trans-title-group>'
        )

    chunks.append("</title-group>")

    return "".join(chunks)
def get_issue_title_xml(title, lang, trans_title=None, trans_lang=None):
    """
    Get the <issue-title> XML given a simple title.
    A second <issue-title> is added for the translated title if both trans_title and
    trans_lang are given.
    """
    titles = [(lang, title)]
    if trans_title and trans_lang:
        titles.append((trans_lang, trans_title))

    xml = ""
    for title_lang, title_text in titles:
        title_text = get_single_title_xml(title_text)
        xml += f'<issue-title xml:lang="{title_lang}" xml:space="preserve">{title_text}</issue-title>'

    return xml
def get_name_params(first_name, last_name, prefix, suffix, orcid):
    """
    Gather the parts of a name in a dict, let helper_update_name_params update it
    and return the dict.
    """
    params = dict(
        first_name=first_name,
        last_name=last_name,
        prefix=prefix,
        suffix=suffix,
        orcid=orcid,
    )
    helper_update_name_params(params)

    return params
def get_tex_from_xml(xml, tag, **kwargs):
    """
    Get the TeX value(s) of an XML fragment by parsing it as a JatsArticle.

    :param xml: XML string; for the "abstract" and "title" tags it is wrapped in
                <article><front><article-meta> before parsing
    :param tag: "abstract" or "title"
    :param kwargs: passed to the JatsArticle constructor
    :return: the value_tex of the first abstract ("abstract"),
             the tuple (title_tex, trans_title_tex) ("title"),
             "" for any other tag
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")

    if tag in ["abstract", "title"]:
        document = f"<article><front><article-meta>{xml}</article-meta></front></article>"
    else:
        document = xml

    tree = etree.fromstring(document.encode("utf-8"), parser=xml_parser)
    xarticle = JatsArticle(tree=tree, **kwargs)

    if tag == "abstract":
        return xarticle.abstracts[0]["value_tex"]
    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex
    return ""