Coverage for apps/ptf/cmds/xml/jats/jats_parser.py: 70%

2059 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-07-18 09:02 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# jats_parser.py is a replacement of xmldata.py 

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom. 

7# Each node is read only once. 

8# 

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds. 

10# The xml tree is parsed in the class constructor (__init__) 

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables. 

12# Some parse_<tag> functions are called directly. 

13# Ex: if tag == "article-meta": 

14# self.parse_article_meta(child) 

15# Other parse_<tag> functions are called "automatically" 

16# fct_name = 'parse_' + tag.replace('-', '_') 

17# ftor = getattr(self, fct_name, None) 

18# if callable(ftor): 

19# ftor(child) 

20# 

21# JatsBase and JatsArticleBase are base classes. 

22# They provide common instance variables and their corresponding parse_<tag> functions 

23# 

24# html_from_<tag> are used to generate the HTML text of a node with mixed content: 

25# a node that mixes text, children and tail 

26# These functions can also extract data and set instance variables (ex: self.figures) 

27# 

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects 

29# 

30# At the end of this file, there are some functions that are/were called by ptf-tools. 

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser 

32# 

33# TODO: the import OAI or the import of a collection could simply call the first function 

34# (def parser(tree)) 

35# 

36################################################################################################## 

37 

38import copy 

39import inspect 

40import os 

41import re 

42 

43from lxml import etree 

44from pylatexenc.latexencode import unicode_to_latex 

45 

46from django.conf import settings 

47from django.urls import reverse 

48from django.utils import timezone 

49 

50from matching import scrapping 

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title 

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors 

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title 

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source 

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume 

56from ptf.cmds.xml.citation_html import get_citation_html 

57from ptf.cmds.xml.xml_base import RefBase 

58from ptf.cmds.xml.xml_base import XmlParserBase 

59from ptf.cmds.xml.xml_utils import escape 

60from ptf.cmds.xml.xml_utils import get_contrib_xml 

61from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions 

62from ptf.cmds.xml.xml_utils import get_normalized_attrib 

63from ptf.cmds.xml.xml_utils import get_text_from_node 

64from ptf.cmds.xml.xml_utils import get_xml_from_node 

65from ptf.cmds.xml.xml_utils import helper_update_name_params 

66from ptf.cmds.xml.xml_utils import make_links_clickable 

67from ptf.cmds.xml.xml_utils import normalize 

68from ptf.cmds.xml.xml_utils import normalize_space 

69from ptf.cmds.xml.xml_utils import split_kwds 

70from ptf.display import resolver 

71from ptf.model_data import ArticleData 

72from ptf.model_data import BookData 

73from ptf.model_data import BookPartData 

74from ptf.model_data import CollectionData 

75from ptf.model_data import Foo 

76from ptf.model_data import IssueData 

77from ptf.model_data import JournalData 

78from ptf.model_data import MathdocPublicationData 

79from ptf.model_data import PublisherData 

80from ptf.model_data import create_contributor 

81from ptf.model_data import create_extlink 

82 

83 

84class JatsBase(XmlParserBase): 

def __init__(self, *args, **kwargs):
    """
    Initialise the state shared by the parsers built on this class.

    The positional and keyword arguments are accepted but are not forwarded
    to the parent constructor.
    """
    super().__init__()

    # XML tree being parsed; assigned in parse_tree
    self.tree = None
    # Footnotes collected by parse_node_with_fn
    self.fns = []
    # {pid: message} entries recorded when an unexpected tag is found
    self.warnings = []

    # Used to create a Tex file from an XML value (ie abstract)
    self.for_tex_file = False
    # Used to convert an XML value for CKEditor (ie abstract)
    self.add_span_around_tex_formula = False

94 

def parse_tree(self, tree):
    """
    Store the XML tree and read its language.

    :param tree: root node of the XML tree
    :return: None. self.tree and self.lang are set
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    # Fall back to "und" when the helper returns a falsy value
    self.lang = lang if lang else "und"

98 

def post_parse_tree(self):
    """
    Add a "source" external link pointing to NUMDAM when self.no_bib is set.

    :return: None. self.ext_links may receive one more entry
    """
    if not self.no_bib:
        return

    # For Geodesic
    numdam_link = create_extlink()
    numdam_link["rel"] = "source"
    numdam_link["location"] = "http://www.numdam.org/item/" + self.pid
    # Used as the source id to find the source in the GDML Views
    numdam_link["metadata"] = "NUMDAM"

    self.ext_links.append(numdam_link)

109 

def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node of an article-title
    :param kwargs: when is_mixed_citation is truthy, the HTML is passed
        through add_span_class_to_html_from_article_title
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

118 

def parse_node_with_break(self, node, **kwargs):
    """
    Convert a break node into a line break.

    :param node: XML node of a break (its content is not read)
    :return: (tex, html). tex is a TeX newline for a TeX file, a single space
        otherwise. html is always "<br/>"
    """
    if self.for_tex_file:
        tex = "\\newline\n"
    else:
        tex = " "

    return tex, "<br/>"

124 

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap node as a <table class="formula">.

    The HTML of the <label> child goes in the "formula-label" cell; the HTML
    of all the other children is concatenated in the "formula-inner" cell.

    :param node: XML node of a chem-struct-wrap
    :return: the same HTML text, twice (used as tex and as html)
    """
    wrap_id = node.attrib.get("id")
    label_html = None
    cell_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label_html = child_html
        else:
            cell_parts.append(child_html)

    id_attr = f'id="{wrap_id}" ' if wrap_id else ""
    cell_html = "".join(cell_parts)
    label_cell = label_html if label_html else ""

    markup = (
        f"<table {id_attr}"
        f'class="formula"><tr><td class="formula-inner">{cell_html}</td>'
        f'<td class="formula-label">{label_cell}</td></tr>'
        "</table>"
    )

    return markup, markup

152 

def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote node in <div class="disp-quote">.

    :param node: XML node of a disp-quote
    :return: (tex, html), both wrapped in the same div
    """
    wrapper = '<div class="disp-quote">{}</div>'
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return wrapper.format(inner_tex), wrapper.format(inner_html)

160 

def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text node in <div class="boxed-text">.

    :param node: XML node of a boxed-text; a non-empty id attribute is
        copied onto the div
    :return: ("", html)
    """
    box_id = node.attrib.get("id")

    _, inner_html = self.parse_inner_node(node, **kwargs)

    id_attr = f'id="{box_id}" ' if box_id else ""
    return "", f'<div {id_attr}class="boxed-text">{inner_html}</div>'

174 

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    The images themselves are handled by parse_node_with_graphic.
    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the fig has an id, the HTML is also stored in
    self.floats[id].

    :param node: XML node of a fig
    :return: ("", html of the <figure>)
    """
    fig_id = node.attrib.get("id")
    label_html = title_html = None
    # Start from an empty string (not None): <attrib> and <permissions> append
    # to the caption even when no caption <p> was found before them.
    caption_html = ""
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                caption_tag = normalize(caption_child.tag)
                if caption_tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif caption_tag == "p":
                    _, paragraph_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # Several paragraphs: the first one gets its own class,
                        # the following ones are displayed as small captions
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += paragraph_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = paragraph_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + caption_tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            caption_html += f'<p class="fig-small-caption">{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                # NOTE(review): the tag is compared without normalize() here,
                # unlike the other children - confirm this is intended
                if gchild.tag == "copyright-statement":
                    _, copyright_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html += f'<p class="fig-small-caption">{copyright_html}</p>'
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"

        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html

        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

279 

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    The footnote is rendered as <p><sup>LABEL</sup> TEXT</p>.

    :param node: XML node of a fn
    :param kwargs:
        keep_fn: when truthy, the footnote HTML is returned in place.
            Otherwise (the default when the option is absent) the HTML is
            added to self.fns, without duplicates, and '' is returned.
        keep_fn_label: when present and falsy, the label is not rendered.
    :return: ("", html). html is '' unless keep_fn is truthy
    """
    # Read both options defensively: a missing keep_fn used to raise KeyError
    keep_fn = kwargs.get("keep_fn", False)
    keep_fn_label = kwargs.get("keep_fn_label", True)

    fn_html = ""
    label_html = None
    fn_id = node.attrib.get("id")

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            # NOTE(review): with several <p> children only the last one is kept
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"

    if label_html and keep_fn_label:
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    if keep_fn:
        return "", html

    if html not in self.fns:
        self.fns.append(html)

    return "", ""

326 

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and
    self.related_objects (once per distinct dict).

    :param node: XML node of a graphic
    :param kwargs: base_url is required as soon as the node has an href
    :return: ("", html of the image link), or ("", "") when the node has no href
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # No image to display. The HTML text used to be left unassigned here,
        # which raised UnboundLocalError on return.
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename
    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        # NOTE(review): nesting of the next statement was reconstructed from a
        # flattened listing - confirm it belongs to the outer "if"
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    # The lightbox index is the number of figures found before this one
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

395 

def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Render an inline-formula or disp-formula node.

    MathJAX is doing a good job with formulae and is now the standard.
    MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    and we could simply return the tex-math text.
    But there are multiple errors in the TeX of the Mersenne articles:
    those mistakes have to be fixed before switching to TeX.

    :param node: XML node of the formula. <alternatives> (tex-math / math)
        and <label> children are read
    :return: (tex, html). html is a <span class="mathjax-formula">, inside a
        <table class="formula"> when there is a label or for a disp-formula
    """
    is_display = node.tag == "disp-formula"
    formula_id = node.attrib.get("id")
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "alternatives":
            for alternative in child:
                alt_tag = normalize(alternative.tag)
                if alt_tag == "tex-math":
                    tex_math = alternative.text or ""
                elif alt_tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative)
                    for unwanted in (
                        "mml:",
                        'xmlns:xlink="http://www.w3.org/1999/xlink"',
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                    ):
                        math_text = math_text.replace(unwanted, "")
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        elif tag == "label":
            label = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Exactly one of the two alternatives is missing: report the calling parse_ functions
    if (math_text == "") != (tex_math == ""):
        stack = inspect.stack()
        excluded = ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
        callers = []
        for frameinfo in stack[1:]:
            fct_name = frameinfo[3]
            if fct_name.startswith("parse_") and not any(
                fragment in fct_name for fragment in excluded
            ):
                callers.append(fct_name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")

    if not is_display and tex_math != "":
        # Inline formulas are delimited with $
        if not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    in_table = bool(label) or is_display
    alt_text = tex_math.replace("\n", "") if is_display else tex_math

    pieces = []
    if in_table:
        pieces.append('<table class="formula"><tr><td class="formula-inner">')

    pieces.append('<span class="mathjax-formula" ')
    if formula_id:
        pieces.append('id="' + formula_id + '" ')
    displayed = math_text if math_text else tex_math
    pieces.append(f'data-tex="{alt_text}">{displayed}</span>')

    if in_table:
        pieces.append('</td><td class="formula-label">')
        if label:
            pieces.append(label)
        pieces.append("</td></tr>")
        pieces.append("</table>")

    html = "".join(pieces)

    tex = tex_math
    if self.add_span_around_tex_formula:
        # The first and last characters (the $ of an inline formula) are dropped
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html

492 

def parse_node_with_institution_id(self, node, **kwargs):
    """
    Ignore institution-id nodes: nothing is rendered for them.

    :return: ("", "")
    """
    empty = ""
    return empty, empty

495 

def parse_node_with_italic(self, node, **kwargs):
    """
    Render an italic node.

    :param node: XML node of an italic
    :return: (tex, html). html is wrapped in <span class="italique">;
        tex is a TeX italic group for a TeX file, an <i> element otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "{\\it " + inner_tex + "}"
    else:
        tex = f"<i>{inner_tex}</i>"

    return tex, f'<span class="italique">{inner_html}</span>'

516 

def parse_node_with_list(self, node, **kwargs):
    """
    Render a list node.

    - list-type "bullet" or "simple" becomes a <ul> (itemize in a TeX file)
    - "order", "number", "alpha-lower", "alpha-upper", "roman-lower" and
      "roman-upper" become an <ol> with the matching type attribute
    - any other value (or no list-type) becomes a <ul> without bullets
    Lists that are not bullet lists become an enumerate in a TeX file.

    The HTML value is always wrapped in its list element, including when
    self.for_tex_file is set (it used to be left unwrapped for non-bullet lists).

    :param node: XML node of a list. When it has a continued-from attribute,
        numbered lists start at self.get_list_start_value(node) + 1
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    ol_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    list_type = node.get("list-type")
    if list_type in ("bullet", "simple"):
        latex_env = "itemize"
        open_tag, close_tag = "<ul>", "</ul>"
    else:
        latex_env = "enumerate"
        ol_type = ol_types.get(list_type)
        if ol_type is None:
            open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
            close_tag = "</ul>"
        elif ol_type == "1" and start is not None:
            # The start value is only used for numbered lists
            open_tag = f'<ol type="1" start="{start}">'
            close_tag = "</ol>"
        else:
            open_tag = f'<ol type="{ol_type}">'
            close_tag = "</ol>"

    html = f"{open_tag}{html}{close_tag}"

    if self.for_tex_file:
        tex = "\n\\begin{" + latex_env + "}\n" + tex + "\\end{" + latex_env + "}\n"
    else:
        tex = f"{open_tag}{tex}{close_tag}"

    return tex, html

561 

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined after the label/title. Every following <p>
    (wrapped in <p> in the HTML) and every nested <list> is appended after
    it, in document order.

    :param node: XML node of a list-item
    :return: (tex, html). tex is a TeX item for a TeX file, a <li> otherwise
    """
    title_tex = title_html = label_tex = label_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Accumulate: assigning here used to drop the content found before
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

622 

def parse_node_with_name_content(self, node, **kwargs):
    """
    Parse a name-content node: its content is returned without any wrapper.

    :return: (tex, html) as computed by parse_inner_node
    """
    content_tex, content_html = self.parse_inner_node(node, **kwargs)

    return (content_tex, content_html)

626 

def parse_node_with_p(self, node, **kwargs):
    """
    Render a p node as an HTML paragraph.

    When the instance has both "floats_to_insert" and "floats" attributes,
    the pending floats are appended after the paragraph: floats_to_insert is
    emptied and the inserted entries are removed from floats.

    :param node: XML node of a p. Its specific-use attribute, if any,
        becomes the class of the HTML paragraph
    :return: (tex, html). tex is wrapped in <p> unless a TeX file is created
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = inner_tex if self.for_tex_file else f"<p>{inner_tex}</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = f"{opening}{inner_html}</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        pending = self.floats_to_insert
        while len(pending) > 0:
            float_id = pending.pop(0)
            if float_id not in self.floats:
                continue
            html += self.floats.pop(float_id)

    return tex, html

647 

def parse_node_with_sc(self, node, **kwargs):
    """
    Render a sc (small caps) node.

    :return: (tex, html). Only html is wrapped, in <span class="smallcaps">
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, '<span class="smallcaps">{}</span>'.format(inner_html)

653 

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of a sec
    :param kwargs: sec_level is the level of the heading (2 when absent)
    :return: (tex, html)
    """
    sec_level = kwargs.get("sec_level", 2)
    # Children (sub-sections) are rendered one level deeper
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    body_tex = []
    body_html = []

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            # label and title are parsed without the kwargs
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            body_tex.append(child_tex)
            body_html.append(child_html)

    tex = ""
    html = "<section>"

    if label_html or title_html:
        heading = ""
        if label_html:
            tex += label_tex
            heading += label_html
        if label_html and title_html:
            tex += " "
            heading += " "
        if title_html:
            tex += title_tex
            heading += title_html
        html += f"<h{sec_level}>{heading}</h{sec_level}>"

    tex += "".join(body_tex)
    html += "".join(body_html) + "</section>"

    return tex, html

701 

def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    In a mixed citation (kwargs["is_mixed_citation"] truthy), the name is
    title-cased and passed through add_span_class_to_html_from_authors.
    Only the text is title-cased: tags and character references are left
    untouched (str.title() on the whole HTML would also capitalise tag
    names, attribute values such as CSS classes, and entities like &amp;).

    :param node: XML node of a string-name
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        # re.split with a capturing group: text is at the even indexes,
        # tags / character references at the odd ones
        pieces = re.split(r"(<[^>]*>|&#?\w+;)", html)
        html = "".join(
            piece if index % 2 else piece.title() for index, piece in enumerate(pieces)
        )
        html = add_span_class_to_html_from_authors(html, **kwargs)

    return tex, html

710 

def parse_node_with_strong(self, node, **kwargs):
    """
    Render a strong (bold) node.

    :return: (tex, html). html is wrapped in <strong>; tex is a TeX bold
        group for a TeX file, a <strong> element otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"

    return tex, f"<strong>{inner_html}</strong>"

721 

def parse_node_with_styled_content(self, node, **kwargs):
    """
    Render a styled-content node.

    :param node: XML node of a styled-content. A non-empty style attribute
        is copied onto a <span> wrapping the HTML
    :return: (tex, html). tex is never wrapped
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style == "":
        return tex, html

    return tex, f'<span style="{style}">{html}</span>'

731 

def parse_node_with_sub(self, node, **kwargs):
    """
    Render a sub (subscript) node.

    :return: (tex, html). html is wrapped in <sub>; tex is a TeX
        textsubscript for a TeX file, a <sub> element otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "\\textsubscript{" + inner_tex + "}"
    else:
        tex = f"<sub>{inner_tex}</sub>"

    return tex, f"<sub>{inner_html}</sub>"

742 

def parse_node_with_sup(self, node, **kwargs):
    """
    Render a sup (superscript) node.

    :return: (tex, html). html is wrapped in <sup>; tex is a TeX
        textsuperscript for a TeX file, a <sup> element otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "\\textsuperscript{" + inner_tex + "}"
    else:
        tex = f"<sup>{inner_tex}</sup>"

    return tex, f"<sup>{inner_html}</sup>"

753 

def parse_node_with_table_generic(self, node, **kwargs):
    """
    Render a table-related node with an HTML element of the same name
    ("row" becomes "tr" and "entry" becomes "td").

    A table gets the "table" class, plus a "nowrap-col-<i>" class for each
    colgroup/col with a width attribute. The rowspan, colspan, align, valign
    and style attributes of the node are carried over to the HTML element.

    :param node: XML node (table, tr, td, row, entry...)
    :return: ("", html)
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    attributes = []

    if tag == "table":
        classes = ["table"]
        # NOTE(review): the column index is taken to advance for every col
        # (nesting reconstructed from a flattened listing) - confirm
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{position}")
        class_value = " ".join(classes)
        attributes.append(f' class="{class_value}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attributes.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        attributes.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attributes.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attributes) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"

790 

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.

    The label (in <strong>) and the caption are placed in a
    <div class="table-wrap-header"> before the HTML of the other children.
    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the table-wrap has an id, the HTML is also stored in
    self.floats[id].

    :param node: XML node of a table-wrap
    :return: ("", html)
    """
    table_id = node.attrib.get("id")
    label = caption = None
    body_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body_parts.append(child_html)

    if table_id:
        text = '<div class="table-wrap table-responsive" id="' + table_id + '">'
    else:
        text = '<div class="table-wrap table-responsive">'

    if label or caption:
        header = ""
        if label:
            header += "<strong>" + label + "</strong>"
        if caption:
            if label:
                header += " "
            header += caption
        text += '<div class="table-wrap-header">' + header + "</div>"

    text += "".join(body_parts)
    text += "</div>"

    wants_floats = "append_floats" in kwargs and kwargs["append_floats"]
    if wants_floats and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text

846 

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Build the <div class="table-wrap-foot"> displayed at the bottom of a table.

    Only the <fn-group> children are rendered; they are parsed with keep_fn=True
    so that their footnotes stay inside this div.

    :param node: table-wrap-foot XML node
    :param kwargs: options forwarded to parse_node_with_mixed_content
    :return: tuple ("", html)
    """

    kwargs["keep_fn"] = True
    parts = ['<div class="table-wrap-foot">']

    for child in node:
        if normalize(child.tag) == "fn-group":
            _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
            parts.append(group_html)

    parts.append("</div>")

    return "", "".join(parts)

867 

def parse_node_with_toc(self, node, **kwargs):
    """
    Render a toc node as an HTML <table> wrapping the HTML of its content.

    :param node: toc XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: tuple ("", html); the TeX value of the content is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", f"<table>{rows_html}</table>"

877 

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Render a toc-entry node as an HTML table row: "<label> <title>" | "p. <page>".

    The rows of nested <toc-entry> children are appended after the row of this entry.
    If a nav-pointer with specific-use="pageindex" is found, both cells become links
    to the PDF of the item, opened at page (pageindex + 1).

    :param node: toc-entry XML node
    :param kwargs: options forwarded to parse_node_with_mixed_content.
        "inside_toc_entry" (truthy) marks an entry embedded in another toc-entry:
        its first cell gets the "inside-toc" class
    :return: tuple ("", html)
    """
    label = title = page = ""
    anchor = ""
    nested_rows = []
    cell_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        child_tag = normalize(child.tag)
        if child_tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif child_tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif child_tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif child_tag == "nav-pointer-group":
            for pointer in child:
                # NOTE(review): the raw tag is compared here, not the normalized one
                if pointer.tag != "nav-pointer":
                    continue
                pointer_use = pointer.attrib.get("specific-use")
                if pointer_use == "pagenum":
                    _, page = self.parse_node_with_mixed_content(pointer, **kwargs)
                elif pointer_use == "pageindex":
                    # NOTE(review): raises if the node has no (or a non numeric) text
                    anchor = int(pointer.text) + 1
        elif child_tag == "toc-entry":
            # Nested entries are parsed without the options of the caller
            _, nested_html = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            nested_rows.append(nested_html)

    toc_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        # presumably Django's URL reverse for the "item-pdf" route - confirm in the file imports
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        toc_text = f'<a href="{href}">{toc_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{cell_class}">{toc_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + "".join(nested_rows)

931 

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an underline node in a <u> element.

    The same <u> wrapper is applied to both the TeX and the HTML values.

    :param node: underline XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: tuple (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"

938 

def parse_node_with_volume(self, node, **kwargs):
    """
    Parse a volume node.

    Inside a mixed-citation ("is_mixed_citation" option), the HTML is post-processed
    by add_span_class_to_html_from_volume. The TeX value is never modified.

    :param node: volume XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: tuple (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

947 

def parse_node_with_xref(self, node, **kwargs):
    """
    Turn an xref node into an HTML link to the first id listed in its "rid" attribute.

    A target starting with "bib" is rewritten with an "r" prefix ("bib12" -> "#r12").
    For xrefs to figures, tables and textboxes, every referenced id is also appended
    to self.floats_to_insert when the instance has such an attribute.

    :param node: xref XML node
    :param kwargs: options forwarded to parse_inner_node.
        "ignore_xref" (truthy) drops the node: two empty strings are returned
    :return: tuple (tex, html); ("", "") if the node has no usable "rid"
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # "rid" may list several ids separated by whitespace.
    # A missing, empty or whitespace-only attribute gives no id at all:
    # such an xref is dropped instead of raising an IndexError on rids[0]
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # The ref-type is the same for all the ids: read it only once
    ref_type = node.get("ref-type")
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

970 

def parse_inner_node(self, node, **kwargs):
    """
    Return the TeX and HTML values of the content of a node: its text and its children.

    Used by the parse_node_with_<tag> functions, which add their own wrapping.
    The children are parsed with is_top=False, so their tail is part of the result.
    The tail of the node itself is not handled here.

    :param node: XML node
    :param kwargs: options forwarded to parse_node_with_mixed_content
    :return: tuple (tex, html)
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex = html = ""

    leading_text = node.text
    if leading_text:
        tex = unicode_to_latex(leading_text) if self.for_tex_file else leading_text
        html = escape(leading_text)

    for child in node:
        child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex += child_tex
        html += child_html

    return tex, html

995 

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse an XML node which mixes text and XML sub-nodes and return its TeX and HTML values.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>

    If a parse_node_with_<tag> function exists for the tag of the node (or for its alias),
    it generates the values. Otherwise the text of the node and the values of its children
    (obtained by calling this function recursively) are concatenated.

    :param node: XML node (None gives two empty strings)
    :param kwargs: parsing options. Every missing option gets the default value listed below
    :return: tuple (tex, html)
    """

    if node is None:
        return "", ""

    option_defaults = (
        # The tail is the text following the end of the node.
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # It is added only when this function is called recursively (is_top=False)
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url of the image links
        ("base_url", ""),
        # Footnotes are removed from the fulltext (and put at the end) except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, default in option_defaults:
        kwargs.setdefault(option, default)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    inner_tex = inner_html = ""

    # I. Value of the node itself (without its tail).

    # Some tags are handled by the parse_node_with_<tag> function of another tag
    handler_aliases = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
    }
    handler_aliases.update(
        dict.fromkeys(
            (
                "table",
                "th",
                "tr",
                "td",
                "thead",
                "tbody",
                "colgroup",
                "col",
                "tgroup",
                "entry",
                "row",
            ),
            "table-generic",
        )
    )

    handler_name = "parse_node_with_" + handler_aliases.get(tag, tag).replace("-", "_")
    handler = getattr(self, handler_name, None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and not kwargs["is_comment"] and kwargs["add_ext_link"]:
            is_extid_value = self.parse_ext_link(node, **kwargs)
            if is_extid_value and kwargs["is_mixed_citation"]:
                # An extid has been found in a mixed_citation:
                # no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        # Only recorded on the instance: nothing is added to the text
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Text of the node (before the text of the children)
        if node.text is not None:
            leading_text = node.text
            inner_tex += unicode_to_latex(leading_text) if self.for_tex_file else leading_text
            inner_html += escape(leading_text)

        # II.2. Children: parsed recursively, with their tail
        child_kwargs = {**kwargs, "is_top": False}

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html.startswith((",", "."))
                and inner_html.endswith(". ")
            ):
                # The TeX values are trimmed at the same positions as the HTML values
                inner_html = inner_html[:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. Wrap the children text with HTML links,
        # unless the text of the node starts with a newline or a space
        if kwargs["add_HTML_link"] and node.text:
            if not re.match(r"[\n ]+", node.text):
                inner_html = make_links_clickable(node.text, inner_html)

    tex, html = inner_tex, inner_html

    # III. Tail of the node, for children only
    if node.tail and not kwargs["is_top"]:
        tail = node.tail
        tex += unicode_to_latex(tail) if self.for_tex_file else tail
        html += escape(tail)

    return tex, html

1147 

def parse_abstract(self, node, **kwargs):
    """
    Parse an abstract node and append its description to self.abstracts.

    The "tag" of the abstract is its abstract-type attribute ("abstract" if missing);
    the "author" type is stored as "abstract". The language falls back to self.lang.

    :param node: abstract XML node
    :param kwargs: unused
    :return: None
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    entry = dict(
        tag=abstract_type,
        lang=lang,
        value_xml=value_xml,
        value_html=value_html,
        value_tex=value_tex,
    )
    self.abstracts.append(entry)

1165 

def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an aff-alternatives node and add its address to the contributors.

    The address is the text of an <aff> child: the text following its <label> if there
    is one, the leading text of the <aff> otherwise. It is added to the contributors
    whose "xrefs" contain the id of the node, or to all the contributors if no <label>
    was found. The contrib_xml of the updated contributors is regenerated.

    Children other than <aff> are reported in self.warnings.

    :param node: aff-alternatives XML node
    :param kwargs: unused
    :return: None
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    aff_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for aff_part in child:
            if aff_part.tag == "label" and address == "":
                label = get_text_from_node(aff_part)
                address = get_text_from_node(child)[len(label) :]
                aff_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        is_referenced = "xrefs" in contrib and xref_id in contrib["xrefs"]
        if is_referenced or aff_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1202 

def parse_award_group(self, node, **kwargs):
    """
    Parse an award-group node and append {"abbrev", "award_id"} to self.awards.

    The award is recorded only if both an <award-id> and a <funding-source> child
    were found. Other children are reported in self.warnings.

    :param node: award-group XML node
    :param kwargs: unused
    :return: None
    """
    funding_source = None
    award_id = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award_id = child.text
        elif tag == "funding-source":
            funding_source = get_text_from_node(child)
        else:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )

    if funding_source is None or award_id is None:
        return

    self.awards.append({"abbrev": funding_source, "award_id": award_id})

1226 

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a contrib-group node.

    - <contrib> children are appended to self.contributors. The role of the group is its
      content-type attribute without its final "s" (ex: "authors" -> "author");
      it is put in front of the role of the contributor, separated by "|"
    - <aff-alternatives> children are handled by parse_aff_alternatives
    - <fn> children are added to self.footnotes_xml and self.footnotes_html
    - other children are reported in self.warnings

    :param node: contrib-group XML node
    :param kwargs: unused
    :return: None
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )

1257 

def parse_counts(self, node, **kwargs):
    """
    Parse a counts node and append a (tag, value) tuple to self.counts for each child.

    The value is the "count" attribute of the child, or its text if the attribute
    is missing. Children without any value are skipped.
    "book-page-count" is stored as "page-count".

    :param node: counts XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        name = normalize(child.tag)
        if name == "book-page-count":
            name = "page-count"

        self.counts.append((name, value))

1270 

def parse_ext_link(self, node, **kwargs):
    """
    Parse an ext-link node: extract an external id from it, or record it as a link.

    The data of the link is appended to self.ext_links only if the "add_ext_link"
    option is set, no external id was extracted, the link is not already recorded
    and its "rel" is not "cover".

    :param node: ext-link XML node
    :param kwargs: options; only "add_ext_link" is read (default: False)
    :return: True if an external id was extracted from the link, False otherwise
    """
    datas = self.get_data_from_ext_link(node)
    extid_value = self.add_extids_from_node_with_link(datas)
    is_extid = extid_value[0] is not None

    if (
        kwargs.get("add_ext_link", False)
        and not is_extid
        and datas not in self.ext_links
        and datas["rel"] != "cover"
    ):
        self.ext_links.append(datas)

    return is_extid

1285 

def parse_front_matter(self, node, **kwargs):
    """
    Parse a front-matter node.

    Sets self.frontmatter_xml (XML of the node), self.frontmatter_foreword_html
    (HTML of the <foreword> child, "" if there is none) and self.frontmatter_toc_html
    (HTML of the <toc> child; not modified if there is none).

    :param node: front-matter XML node
    :param kwargs: unused
    :return: None
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    # Tag of the child -> instance variable receiving its HTML
    html_targets = {
        "foreword": "frontmatter_foreword_html",
        "toc": "frontmatter_toc_html",
    }

    for child in node:
        target = html_targets.get(normalize(child.tag))
        if target is not None:
            _, child_html = self.parse_node_with_mixed_content(child)
            setattr(self, target, child_html)

1297 

def parse_id(self, node, **kwargs):
    """
    Parse an id node (article-id, book-id, book-part-id...) and store its value.

    The type of the id is read from the pub-id-type, book-id-type or book-part-id-type
    attribute (first one found):
    - "pii": stored in self.pii, only if self.pid is longer than 2 characters
      and starts with "CR"
    - "numdam-id", "mathdoc-id": stored in self.pid
    - "ark": appended to self.extids
    - "doi", "eid": appended to self.ids; a doi is also stored in self.doi
    - other types are ignored

    :param node: id XML node
    :param kwargs: unused
    :return: None
    """
    id_value = node.text
    id_type = next(
        (
            node.attrib[name]
            for name in ("pub-id-type", "book-id-type", "book-part-id-type")
            if name in node.attrib
        ),
        "",
    )

    if id_type == "pii":
        # Elsevier ids get a special treatment: web scraping to find the date_published
        pid = self.pid
        if pid and len(pid) > 2 and pid[:2] == "CR":
            self.pii = id_value
    elif id_type in ("numdam-id", "mathdoc-id"):
        self.pid = id_value
    elif id_type == "ark":
        self.extids.append((id_type, id_value))
    elif id_type in ("doi", "eid"):
        self.ids.append((id_type, id_value))
        if id_type == "doi":
            self.doi = id_value

1321 

def parse_kwd_group(self, node, **kwargs):
    """
    Parse a kwd-group node and add its keywords to self.kwds.

    The keywords are the texts of the <kwd> children. An <unstructured-kwd-group> child
    replaces the keywords collected so far by the result of split_kwds on its TeX value.
    Other children are reported in self.warnings.

    Each keyword is stored as {"type", "lang", "value"}:
    - "type" is the content-type attribute of the group, or its kwd-group-type
      attribute if content-type is missing or empty ("" if both are)
    - "lang" is the language of the group, self.lang by default

    :param node: kwd-group XML node
    :param kwargs: unused
    :return: None
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # The attribute name used to be "content-node_type": a leftover of a rename that
    # could never match, so the content-type of the group was always ignored
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend([{"type": content_type, "lang": lang, "value": kwd} for kwd in kwds])

1351 

def parse_ref_list(self, node, **kwargs):
    """
    Parse a ref-list node.

    Each <ref> child becomes a JatsRef appended to self.bibitems; its citation_html is
    appended to self.bibitem and its warnings are added to self.warnings.
    <p> children are parsed only for their side effects.
    Other children are reported in self.warnings.

    :param node: ref-list XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            bibitem = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(bibitem.warnings)
            self.bibitems.append(bibitem)
            self.bibitem.append(bibitem.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        self.warnings.append(
            {
                self.pid: f"{self.__class__.__name__}."
                f"{inspect.currentframe().f_code.co_name} {tag}"
            }
        )

1374 

def parse_related_article(self, node, **kwargs):
    """
    Parse a related-article node and append the relation to self.relations.

    The relation is an object with 2 attributes: rel_type (related-article-type
    attribute of the node) and id_value (text of the node).
    If the instance has a "pii" attribute and the id does not contain "10."
    (and is not "NONE"), the id is a pii: scrapping.fetch_article is called
    to get the corresponding DOI.

    :param node: related-article XML node
    :param kwargs: unused
    :return: None
    """
    rel_type = get_normalized_attrib(node, "related-article-type") or ""
    id_value = node.text

    is_pii = (
        hasattr(self, "pii") and id_value and "10." not in id_value and id_value != "NONE"
    )
    if is_pii:
        # A pii is used instead of a DOI: call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = rel_type
    relation.id_value = id_value

    self.relations.append(relation)

1390 

def parse_related_object(self, node, **kwargs):
    """
    Parse a related-object node.

    Without a document-id-type attribute, the description of the object
    (rel, mimetype, location, base, metadata) is appended to self.related_objects.
    With a document-id-type attribute, the node is a relation to another document:
    a "refers to" relation with the document-id is appended to self.relations,
    unless the document-id is "NONE". An id without "10." is a pii:
    scrapping.fetch_article is called to get the corresponding DOI.

    :param node: related-object XML node
    :param kwargs: unused
    :return: None
    """
    mimetype = node.get("content-type") or ""
    rel = node.get("link-type") or ""
    href = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""
    metadata = get_xml_from_node(node)

    data = dict(rel=rel, mimetype=mimetype, location=href, base=base, metadata=metadata)

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(data)
        return

    id_value = node.get("document-id") or ""
    if id_value == "NONE":
        return

    if id_value and "10." not in id_value:
        # A pii is used instead of a DOI: call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = id_value

    self.relations.append(relation)

1423 

def parse_sec(self, node, **kwargs):
    """
    Parse a sec node: only its <ref-list> children are handled (see parse_ref_list).

    <title> children are ignored, other children are reported in self.warnings.

    :param node: sec XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
        else:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )

1442 

def parse_self_uri(self, node, **kwargs):
    """
    Parse a self-uri node and append the description of the full text to self.streams.

    The mimetype is the content-type attribute of the node ("text/html" by default).
    Nodes with the "application/xml" mimetype are not recorded.

    :param node: self-uri XML node
    :param kwargs: unused
    :return: None
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location
    # like the other Mathdoc collections.
    # The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    text = "" if node.text is None else normalize_space(node.text)
    stream = dict(rel="full-text", mimetype=mimetype, location=location, base=base, text=text)

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype != "application/xml":
        self.streams.append(stream)

1470 

def parse_sub_article(self, node, **kwargs):
    """
    Parse a sub-article node (used for translations).

    The node is parsed as a JatsArticle, which is appended to self.translations.

    :param node: sub-article XML node
    :param kwargs: unused
    :return: None
    """
    self.translations.append(JatsArticle(tree=node))

1475 

def parse_subj_group(self, node, **kwargs):
    """
    Parse a subj-group node and append its subjects to self.subjs.

    Each <subject> child is stored as {"type", "lang", "value"}: the type is the
    subj-group-type attribute of the group, the language is the one of the group
    (self.lang by default). Other children are reported in self.warnings.

    :param node: subj-group XML node
    :param kwargs: unused
    :return: None
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1497 

def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a supplementary-material node and append it to self.supplementary_materials.

    The location is the href of the node, or its id if there is no href.
    The mimetype is the mimetype attribute of the node, or the value returned by
    resolver.get_mimetype for the location.
    A material is not recorded if another material with the same file name
    (basename of the location) has already been recorded.

    :param node: supplementary-material XML node
    :param kwargs: unused
    :return: None
    """
    caption = ""
    for child in node:
        # NOTE(review): the raw tag is compared here, not the normalized one
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = dict(
        rel=node.attrib.get("content-type") or "supplementary-material",
        mimetype=mimetype,
        location=location,
        base="",
        metadata="",
        caption=caption or "",
    )

    known_file_names = [
        os.path.basename(item["location"]) for item in self.supplementary_materials
    ]
    if os.path.basename(location) not in known_file_names:
        self.supplementary_materials.append(material)

1528 

def parse_title(self, node, **kwargs):
    """
    Parse a title node and set self.title_tex and self.title_html.

    The xref nodes of the title are ignored.

    :param node: title XML node
    :param kwargs: unused
    :return: None
    """
    tex, html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = tex
    self.title_html = html
    # In xmldata.py, title_xml had the <title_group> tag:
    # self.title_xml can't be set in parse_title

1535 

def parse_title_group(self, node, **kwargs):
    """
    Parse a title-group node.

    - title nodes are handled by parse_title
    - the TeX/HTML values of a <subtitle> are appended to the title, after a space
    - <trans-title-group> is handled by parse_trans_title_group
    - the HTML of <abbrev-title> is stored in self.abbrev
    - the <fn> children of a <fn-group> are added to self.footnotes_xml
      and self.footnotes_html
    - other children are reported in self.warnings

    self.title_xml is set to the XML of the node, without its <fn-group> children.

    :param node: title-group XML node
    :param kwargs: unused
    :return: None
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    has_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            subtitle_tex, subtitle_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + subtitle_tex
            self.title_html += " " + subtitle_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            has_fn_group = True
            for fn_node in child:
                # NOTE(review): the raw tag is compared here, not the normalized one
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True, keep_fn_label=False)
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )

    xml_source = node
    if has_fn_group:
        # fn-group is now a funding statement and will be exported separately in the XML:
        # => remove it from the title-group
        xml_source = etree.Element("title-group")
        for child in node:
            if normalize(child.tag) != "fn-group":
                xml_source.append(copy.deepcopy(child))

    self.title_xml = get_xml_from_node(xml_source)

1584 

def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a trans-abstract node and append its description to self.abstracts.

    The "tag" of the abstract is its abstract-type attribute ("abstract" if missing);
    the "author" type is stored as "abstract". The language is "und" if the node
    does not declare one.

    :param node: trans-abstract XML node
    :param kwargs: unused
    :return: None
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    entry = dict(
        tag=abstract_type,
        lang=lang,
        value_xml=value_xml,
        value_html=value_html,
        value_tex=value_tex,
    )
    self.abstracts.append(entry)

1601 

def parse_trans_title(self, node, **kwargs):
    """
    Parse a trans-title node.

    Sets self.trans_title_tex, self.trans_title_html and self.trans_title_xml.

    :param node: trans-title XML node
    :param kwargs: unused
    :return: None
    """
    tex, html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = tex
    self.trans_title_html = html
    self.trans_title_xml = get_xml_from_node(node)

1605 

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a trans-title-group node.

    <trans-title> children are handled by parse_trans_title, other children are
    reported in self.warnings. self.trans_lang is set to the language of the group
    ("und" if the node does not declare one).

    :param node: trans-title-group XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            self.warnings.append(
                {
                    self.pid: f"{self.__class__.__name__}."
                    f"{inspect.currentframe().f_code.co_name} {tag}"
                }
            )
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1624 

def get_data_from_contrib(self, node):
    """
    Build the contributor dict described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or <name-alternatives>.
    In a <mixed-citation>, each <name> creates 1 person: we can't use the same code.

    :param node: the <contrib> element
    :return: the dict created by ``create_contributor`` and filled from the node
    """

    person = create_contributor()

    for child in node:
        kind = child.tag

        if kind in ("name", "string-name"):
            self.update_data_from_name(child, person)
            # A <string-name> without structured parts keeps its raw text
            if (
                kind == "string-name"
                and person["first_name"] == ""
                and person["last_name"] == ""
            ):
                person["string_name"] = child.text or ""
        elif kind == "name-alternatives":
            person["mid"] = self.get_data_from_name_alternatives(child)
        elif kind == "contrib-id":
            id_type = child.get("contrib-id-type") or ""
            if id_type == "orcid":
                person["orcid"] = child.text or ""
            if id_type == "idref":
                person["idref"] = child.text or ""
        elif kind == "address":
            person["addresses"].append(get_text_from_node(child))
        elif kind == "email":
            person["email"] = child.text or ""
        elif kind == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (child.get("ref-type") or "") == "aff":
                target = child.get("rid") or ""
                if target == "":
                    # No rid attribute: fall back to the text of the xref
                    target = get_text_from_node(child)
                if target != "":
                    person.setdefault("xrefs", []).append(target)
        elif kind == "collab":
            person["string_name"] = child.text or ""
        elif kind == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to the contributor role as we want a
            # controlled vocabulary (author / editor / organizer...): the value is ignored.
            pass
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + kind})

    # The addresses are deliberately left unsorted: sorting them (introduced on 22/09/2020,
    # based on differences between the Cedrics->JATS XSLT and the Cedrics import) caused
    # differences between the HTML and the PDF (discovered in PCJ).

    helper_update_name_params(person)

    if node.get("corresp") == "yes":
        person["corresponding"] = True

    person["deceased_before_publication"] = node.get("deceased") == "yes"
    person["equal_contrib"] = node.get("equal-contrib") == "yes"

    return person

1704 

def get_data_from_custom_meta(self, node):
    """
    Read the name/value pair stored in a custom metadata node.

    :param node: element whose children are <meta-name> and <meta-value>
    :return: tuple (name, value); each item is the text of the last matching
        child (possibly None for an empty element) or "" when the child is absent
    """
    found = {"meta-name": "", "meta-value": ""}

    for child in node:
        tag = normalize(child.tag)

        if tag in found:
            found[tag] = child.text
        else:
            # Unknown child: keep a trace of it, keyed by the current pid
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    return found["meta-name"], found["meta-value"]

1728 

def get_data_from_date(self, node, ignore_month=False):
    """
    Return the date stored in a date-like node as a string.

    The ``iso-8601-date`` attribute wins when present and is returned untouched.
    Otherwise the string is assembled from the <year>, <month> and <day> children
    as "year", "year-month" or "year-month-day".

    A day is only appended when a month is kept: "year-day" is not a valid date,
    and callers passing ``ignore_month=True`` (e.g. to get the year of an issue)
    expect the year alone.

    :param node: the date element
    :param ignore_month: when True, the <month> child (and therefore the <day>) is ignored
    :return: the date string; the raw text of <year> (possibly None) when only the
        year is available, "" when there is no <year> child
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for child in node:
        tag = normalize(child.tag)

        if tag == "year":
            year = child.text
        elif tag == "month":
            # An ignored month is a caller decision, not an unknown tag: no warning
            if not ignore_month:
                month = child.text
        elif tag == "day":
            day = child.text
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    date_str = year
    if date_str and month:
        date_str += "-" + month
        # The day is meaningless without its month
        if day:
            date_str += "-" + day

    return date_str

1762 

def get_data_from_ext_link(self, node, **kwargs):
    """
    Describe an <ext-link> node as a link dict, without side effects on the instance.

    :param node: the <ext-link> element
    :param kwargs: forwarded to ``parse_inner_node``; ``add_HTML_link`` is forced to False
    :return: dict with the keys "rel" (ext-link-type), "mimetype" (always ""),
        "location" (href), "base" and "metadata" (second item returned by
        ``parse_inner_node``)
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    inner_kwargs = {**kwargs, "add_HTML_link": False}
    _, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1780 

def get_data_from_history(self, node):
    """
    Collect the dates listed in a <history> node.

    Children without a ``date-type`` attribute are recorded in ``self.warnings``.

    :param node: the <history> element
    :return: list of {"type": <date-type>, "date": <date string>} dicts, in document order
    """
    # TODO: transform the returned list in a hash where date-type is the key
    #       => Change database_cmds
    dates = []

    for child in node:
        if "date-type" not in child.attrib:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
            continue

        kind = child.attrib["date-type"]
        dates.append({"type": kind, "date": self.get_data_from_date(child)})

    return dates

1802 

def update_data_from_name(self, node, contributor):
    """
    Copy the name parts found under a name node into ``contributor``.

    Children without text are skipped. Children with text but an unknown tag
    are recorded in ``self.warnings``.

    :param node: element holding <given-names>, <surname>, <prefix>, <suffix>
    :param contributor: dict updated in place (keys first_name, last_name, prefix, suffix)
    """
    field_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for child in node:
        if child.text is None:
            continue

        field = field_by_tag.get(child.tag)
        if field is None:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
        else:
            contributor[field] = child.text

1824 

def get_data_from_name_alternatives(self, node):
    """
    Extract the index form of a name from a <name-alternatives> node.

    Only children with text are considered. The text of a <string-name> with
    specific-use="index" is returned (the last one wins); children with another
    tag are recorded in ``self.warnings``.

    :param node: the <name-alternatives> element
    :return: the index name, or "" when none is found
    """
    index_name = ""

    for child in node:
        if child.text is None:
            continue

        if child.tag != "string-name":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
            continue

        if child.get("specific-use") == "index":
            index_name = child.text

    return index_name

1845 

def get_data_from_uri(self, node, **kwargs):
    """
    Describe a <uri> node as a link dict, without side effects on the instance.

    :param node: the <uri> element
    :param kwargs: forwarded to ``parse_inner_node``; ``add_HTML_link`` is forced to False
    :return: dict with the same keys as the one built for an <ext-link>:
        "rel" is None, "mimetype" and "base" are "", "location" is the href
    """
    location = get_normalized_attrib(node, "href") or ""

    inner_kwargs = {**kwargs, "add_HTML_link": False}
    _, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1855 

def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text to output for a link node.

    The node data is obtained from the ``get_data_from_<tag>`` method matching
    the node tag. When the link has no "rel", or a "uri" rel, the text is a
    TeX ``\\href`` (if ``self.for_tex_file``) or the result of
    ``make_links_clickable``; otherwise the raw text of the node is returned.

    :param node: the link element
    :param kwargs: forwarded to the ``get_data_from_<tag>`` method
    :return: the link text
    """
    raw_text = node.text or ""

    method_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    link = getattr(self, method_name)(node, **kwargs)

    rel = link["rel"]
    if rel and rel != "uri":
        return raw_text

    target = link["location"]
    if self.for_tex_file:
        return "\\href{" + target + "}{" + link["metadata"] + "}"
    return make_links_clickable(target, link["metadata"])

1869 

def get_list_start_value(self, list_node):
    """
    Compute the numbering offset of a list that continues a previous one.

    A list with a ``continued-from`` attribute starts after the items of the
    referenced list, itself possibly continuing another list.

    :param list_node: the list element
    :return: number of items that precede this list in the chain of continued
        lists; 0 when the list is not a continuation or when the referenced
        list can not be found in ``self.tree``
    """
    continued_from = list_node.get("continued-from")
    if continued_from is None:
        return 0

    from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
    if from_node is None:
        # Dangling reference: number the list from the beginning instead of
        # failing on an unbound variable
        return 0

    return len(from_node) + self.get_list_start_value(from_node)

1880 

1881 

class MathdocPublication(MathdocPublicationData, JatsBase):
    """Publication (collection) parsed from a Mathdoc publication XML tree."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the data object, then parse the tree.

        :param kwargs: must contain "tree", the root element of the publication
        """
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Read the direct children of the publication node and set the instance variables.

        Unknown children are recorded in ``self.warnings``.

        :param tree: the root element of the publication
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # Ids of another type are ignored
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: serial-type, wall and provider.

        Custom metadata with another name are ignored; children with another
        tag are recorded in ``self.warnings``.

        :param node: the <custom-meta-group> element
        :param kwargs: accepted for signature compatibility, not used here
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Store a <description> node as an abstract with the "description" tag.

        ``value_xml`` is the serialized node; ``value_html`` and ``value_tex``
        are the same markup without the wrapping <description> tags.

        :param node: the <description> element
        :param kwargs: accepted for signature compatibility, not used here
        """
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the opening tag together with its attributes (the previous code
        # looked for a misspelled "<decription" and never removed anything),
        # then the closing tag.
        inner = re.sub(r"^\s*<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1964 

1965 

class JatsPublisher(PublisherData):
    """Publisher parsed from a <publisher> node (name and location)."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the data object, then parse the tree.

        :param kwargs: must contain "tree", the <publisher> element
        """
        super().__init__(*args, **kwargs)
        self.warnings = []
        self.parse_tree(kwargs["tree"])
        # The warnings collected while parsing are discarded: the list is reset
        self.warnings = []

    def parse_tree(self, tree):
        """
        Read <publisher-name> and <publisher-loc>; record any other child in the warnings.

        :param tree: the <publisher> element
        """
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
                continue
            if tag == "publisher-loc":
                self.loc = node.text
                continue

            # NOTE(review): relies on self.pid being provided by PublisherData - confirm
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

1991 

1992 

class JatsJournal(JournalData, JatsBase):
    """Journal parsed from a <journal-meta> node."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the data object, then parse the tree.

        :param kwargs: must contain "tree", the <journal-meta> element
        """
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Read the journal id, titles, publisher and ISSNs.

        Unknown children are recorded in ``self.warnings``.

        :param tree: the <journal-meta> element
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                # An id without type is considered as a numdam-id
                id_type = node.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                # An ISSN without pub-type is considered as a print ISSN
                issn_type = node.get("pub-type") or "ppub"
                if issn_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif issn_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

2030 

2031 

class JatsIssue(IssueData, JatsBase):
    """Issue parsed from a JATS issue tree, together with the articles of its <body>."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the data object, read the options, then parse the tree.

        :param kwargs: must contain "tree"; "from_folder" and "no_bib" are optional
        """
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs["from_folder"] if "from_folder" in kwargs else None
        # no_bib is forwarded to each JatsArticle created in parse_tree
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the issue tree, then post-process the articles.

        First pass: <journal-meta>, <issue-meta> and the <article> children of <body>.
        Then: the issue editors replicated in the articles are removed and, for
        Elsevier issues (at least one article with a ``pii`` attribute), the
        location of the icons, the article types and the subjects are fixed.

        :param tree: the root element of the issue
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        # The warnings of the articles are reported at the issue level
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        is_elsevier = False
        for xarticle in self.articles:
            # Only Elsevier articles are expected to have a pii attribute
            if hasattr(xarticle, "pii"):
                is_elsevier = True

            # The editors of the article are removed only if they are the same
            # (same count, same last and first names, same order) as the issue editors
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]
            is_equal = len(editors) == len(issue_editors)
            i = 0
            while is_equal and i < len(editors):
                if (
                    editors[i]["last_name"] != issue_editors[i]["last_name"]
                    or editors[i]["first_name"] != issue_editors[i]["first_name"]
                ):
                    is_equal = False
                i += 1
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

        if is_elsevier:
            # Fix location of icons
            for link in self.ext_links:
                if link["rel"] in ["icon", "small_icon"]:
                    base_dir = self.journal.pid
                    location = link["location"]
                    if os.path.dirname(location) != base_dir:
                        location = os.path.join(base_dir, self.pid, location)
                    if self.from_folder:
                        location = os.path.join(self.from_folder, location)
                        location = "file:" + location
                    link["location"] = location

            # Fix article types and subjects
            for xarticle in self.articles:
                article_type = "research-article"
                old_type = ""
                # Subjects kept for the article once the redundant ones are filtered out
                new_subjs = []

                if xarticle.fpage != "":
                    try:
                        value = int(xarticle.fpage)
                    except ValueError:
                        # fpage is not a number: the article is an editorial
                        article_type = "editorial"

                if article_type == "research-article":
                    for subj in xarticle.subjs:
                        if subj["type"] == "type":
                            # Fix article types
                            value = subj["value"].lower()
                            # Remembered for the "perspective" heading test below
                            old_type = value
                            if value == "discussion":
                                article_type = "letter"
                            elif value == "editorial":
                                if xarticle.title_tex.lower().find("foreword") == 0:
                                    article_type = "foreword"
                                else:
                                    article_type = "editorial"
                            elif value in ["mini review", "review article", "book review"]:
                                article_type = "review"
                            elif value == "research article":
                                article_type = "research-article"
                            elif value == "short communication":
                                article_type = "foreword"
                            elif value == "correspondence":
                                article_type = "letter"
                            elif value.find("conference") == 0:
                                article_type = "congress"
                        elif subj["type"] == "heading" and not xarticle.title_tex:
                            # The title may be stored in the heading: fix it
                            xarticle.title_tex = xarticle.title_html = subj["value"]
                            xarticle.title_xml = get_title_xml(subj["value"])
                        elif subj["type"] == "heading":
                            value = subj["value"].lower().strip()
                            issue_title = self.title_tex.lower()
                            if issue_title.find("dossier: ") == 0:
                                # Remove the "dossier: " prefix (9 characters) of the issue title
                                issue_title = issue_title[9:]
                                self.title_tex = self.title_html = self.title_tex[9:]
                                self.title_xml = (
                                    "<issue-title>"
                                    + get_single_title_xml(issue_title)
                                    + "</issue-title>"
                                )

                            # Some heading values are in fact article type
                            # (find(...) == 0 means "the heading starts with")
                            if value.find("erratum") == 0:
                                article_type = "erratum"
                            elif value.find("corrigendum") == 0:
                                article_type = "corrigendum"
                            elif value.find("foreword") == 0:
                                article_type = "foreword"
                            elif value.find("nécrologie") == 0 or value.find("obituary") == 0:
                                article_type = "history-of-sciences"
                            elif (
                                value.find("block calendar/éphéméride") == 0
                                or value.find("chronique") == 0
                            ):
                                article_type = "history-of-sciences"
                            elif value.find("histoire") == 0 or value.find("historic") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("tribute/hommage") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("note historique") == 0:
                                article_type = "historical-commentary"
                            elif (
                                value.find("le point sur") == 0 or value.find("le point-sur") == 0
                            ):
                                article_type = "review"
                            elif (
                                value.find("review") == 0
                                or value.find("revue") == 0
                                or value.find("concise review") == 0
                            ):
                                article_type = "review"
                            elif value.find("conférence") == 0:
                                article_type = "congress"
                            elif (
                                value.find("communication") == 0 or value.find("preliminary") == 0
                            ):
                                article_type = "preliminary-communication"
                            elif value.find("perspective") == 0 and old_type in [
                                "correspondence",
                                "short communication",
                            ]:
                                article_type = "opinion"
                            elif value.find("debate") == 0:
                                article_type = "opinion"
                            elif (
                                value.find("index") == 0
                                or value.find("keyword") == 0
                                or value.find("sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif (
                                value.find("table auteurs") == 0
                                or value.find("table sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif value.find("page présentation des index") == 0:
                                article_type = "editorial"
                            elif value.find("fac-similé") == 0:
                                # crbiol article, Pubmed files them as "Classical Article"
                                article_type = "historical-commentary"
                                # The subject is kept in this case to preserve the
                                # "fac-similé" (== copy) mention
                                new_subjs.append(subj)
                            # Ignore the issue titles
                            elif (
                                not self.title_tex
                                or value.find(self.title_tex.lower().strip()) != 0
                            ):
                                # Exclude headings that are redundant with article types
                                exclude_list = [
                                    "editorial",
                                    "éditorial",
                                    "avant-propos",
                                    "book review",
                                    "comment",
                                    "concise review paper",
                                    "answer",
                                    "commentaire",
                                    "commentary",
                                    "reply",
                                    "foreword",
                                    "full paper",
                                    "mémoire",
                                ]
                                if len([x for x in exclude_list if value.find(x) == 0]) == 0:
                                    new_subjs.append(subj)
                        else:
                            # Subjects that are neither a type nor a heading are kept as is
                            new_subjs.append(subj)

                xarticle.atype = article_type
                xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: provider and efirst.

        Custom metadata with another name are ignored; children with another
        tag are recorded in ``self.warnings``.

        :param node: the <custom-meta-group> element
        :param kwargs: accepted for signature compatibility, not used here
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Read the children of <issue-meta> and set the instance variables.

        Tags without a dedicated branch are dispatched to the ``parse_<tag>``
        method when it exists (called with ``add_ext_link=True``); otherwise they
        are recorded in ``self.warnings``. The last-modified date defaults to now
        when the <history> does not provide one.

        :param node: the <issue-meta> element
        :param kwargs: accepted for signature compatibility, not used here
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    # The first title in a compatible language is the main title,
                    # any other one is stored as the translated title
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                # Automatic dispatch: <some-tag> is handled by parse_some_tag, if defined
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2346 

2347 if self.last_modified_iso_8601_date_str is None: 

2348 self.last_modified_iso_8601_date_str = timezone.now().isoformat() 

2349 

2350 

class JatsArticleBase(JatsBase):
    """Base class providing the custom metadata parsing shared by article-like objects."""

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: article-number, talk-number and presented.

        A "presented" metadata adds a contributor with the "presenter" role.
        Custom metadata with another name are ignored; children with another
        tag are recorded in ``self.warnings``.

        :param node: the <custom-meta-group> element
        :param kwargs: accepted for signature compatibility, not used here
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "custom-meta":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            name, value = self.get_data_from_custom_meta(child)

            if name == "article-number":
                self.article_number = value
            elif name == "talk-number":
                self.talk_number = value
            elif name == "presented":
                # Keep only the name: drop the English or French introduction
                presenter_name = value.replace("Presented by ", "")
                presenter_name = presenter_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = presenter_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)

2381 

2382 

2383class JatsArticle(ArticleData, JatsArticleBase): 

def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Initialize the data object, read the options, then parse the tree.

    :param kwargs: must contain "tree" (the article element). Optional keys:
        "pid" and "issue" (default None), "add_span_around_tex_formula" and
        "for_tex_file" (default False), "from_folder" (default None),
        "no_bib" (default False)
    """
    super().__init__(*args, **kwargs)

    options = kwargs.get
    self.pid = options("pid")
    self.issue = options("issue")

    self.add_span_around_tex_formula = options("add_span_around_tex_formula", False)
    self.for_tex_file = options("for_tex_file", False)
    self.from_folder = options("from_folder")
    self.no_bib = options("no_bib", False)

    self.parse_tree(kwargs["tree"])

2399 

def parse_tree(self, tree):
    """
    Parse an article tree from top to bottom and set the instance variables.

    The tree is read in two passes: the front matter and the floats-group first
    (so that the floats are known before the body is parsed), then <back>,
    <body> and <sub-article>. Unknown nodes are recorded in ``self.warnings``.
    Finally the footnotes are appended to ``body_html``, the funding statement
    is wrapped if needed and ``post_parse_tree`` is called.

    :param tree: the article element
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + child_tag
                        }
                    )
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)

        if tag == "back":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "ref-list" and not self.no_bib:
                    print("Parse bib")
                    self.parse_ref_list(child)
                elif child_tag == "ack":
                    self.parse_ack(child)
                elif child_tag == "sec":
                    self.parse_sec(child)
                elif child_tag == "app-group":
                    self.parse_app_group(child)
                elif child_tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + child_tag
                        }
                    )
        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in ("floats-group", "front", "front-stub"):
            # Handled above. <front-stub> is parsed in the first loop as well:
            # it must not be reported as an unknown tag here.
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Add the footnotes at the end
    if len(self.fns) > 0:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"

        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    # Wrap the funding statement, unless it is already wrapped
    if (
        len(self.funding_statement_xml) > 0
        and self.funding_statement_xml.find('<name-content content-type="fn"') == -1
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # The date_published of Elsevier articles is not fetched here (web scraping):
    # it was moved to the import management commands since Elsevier blocks IP
    # after 1000+ requests.

    self.post_parse_tree()

2508 

def update_body_content(self, node, **kwargs):
    """
    Parse a node holding body content and append the result to the body of the article
    (self.body_tex, self.body_html and self.body_xml).

    :param node: XML node to parse. Nothing is done if it has no children.
    :param kwargs: options forwarded to the node parser ("base_url" is always overwritten).
                   If use_sec is set, the node is parsed with parse_node_with_sec and its
                   XML is embedded in a <sec> of the existing body_xml.
    :return: None
    """
    if len(node) == 0:
        # Most journals do not display the Full text:
        # the <body> is then used to store the text for the search engine and has no children.
        # body_html is not computed in this case.
        # Same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)
    kwargs["base_url"] = base_url

    use_sec = bool(kwargs.get("use_sec"))
    nb_supplements = len(self.supplementary_materials)

    # Hack for Elsevier (use_sec): convert <ack> into <sec> of the <body>
    parse_node = self.parse_node_with_sec if use_sec else self.parse_node_with_mixed_content
    body_tex, body_html = parse_node(node, **kwargs)

    append_to_body = True
    if len(self.supplementary_materials) != nb_supplements:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements:
        # the content is kept only if a <p> has a child that is not a supplementary-material.
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )

    if not append_to_body:
        return

    self.body_tex = self.body_tex + body_tex if self.body_tex else body_tex
    self.body_html = self.body_html + body_html if self.body_html else body_html

    body_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = body_xml
    elif use_sec:
        # The last 7 characters of the current body_xml (the length of "</body>") are dropped,
        # as well as the first 5 and last 6 characters of the new XML
        # (presumably "<ack>" and "</ack>"), which is re-wrapped in a <sec>.
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{body_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{body_xml}</body>"

2559 

def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    A node with content-type="COI-statement" sets self.coi_statement with the text of the node.
    Any other <ack> is added to the body content.
    """
    if (node.get("content-type") or "") == "COI-statement":
        self.coi_statement = get_text_from_node(node)
        return

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    self.update_body_content(node, use_sec=True)

2567 

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is added to the body content.
    Any other child is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "sec":
            # Elsevier can store all appendixes inside one <app> ?!?
            # One of them can store the supplements and has to be ignored in the body_html
            self.update_body_content(child)
            continue

        location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: location + " " + tag})

2586 

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is parsed with parse_app.
    Any other child is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "app":
            self.parse_app(child)
            continue

        location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: location + " " + tag})

2603 

def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node: each <subj-group> child is parsed with parse_subj_group.
    Any other child is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "subj-group":
            self.parse_subj_group(child)
            continue

        location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: location + " " + tag})

2620 

def parse_article_meta(self, node, **kwargs):
    """
    Parse an <article-meta> node.

    Ids, pages, size, elocation and dates are handled here.
    Some tags are ignored on purpose. The other tags are dispatched to the parse_<tag>
    function of the object (called with add_ext_link=True), if any.
    A tag without parse_<tag> function is reported in self.warnings.
    """
    # Tags that are deliberately not parsed here
    # TODO: store permissions in XML
    # 2022/11/15 Mersenne meeting. ignore author-notes
    ignored_tags = ("volume", "issue-id", "permissions", "pub-date-not-available", "author-notes")

    for child in node:
        tag = normalize(child.tag)

        if tag == "article-id":
            self.parse_id(child)
        elif tag == "fpage":
            self.fpage = child.text
            self.page_type = child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = child.text or ""
        elif tag == "page-range":
            self.page_range = child.text
        elif tag in ("page-count", "size"):
            self.size = child.text
        elif tag == "elocation-id":
            self.elocation = child.text
        elif tag == "pub-date":
            # A pub-date without date-type is a publication date
            is_publication_date = (child.get("date-type") or "pub") == "pub"
            date_str = self.get_data_from_date(child)
            if is_publication_date:
                self.date_published_iso_8601_date_str = date_str
            else:
                self.history_dates.append({"type": "online", "date": date_str})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(child)
            for history_date in self.history_dates:
                if history_date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = history_date["date"]
        elif tag in ignored_tags:
            continue
        else:
            ftor = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(ftor):
                ftor(child, add_ext_link=True)
                continue

            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + tag})

2672 

def parse_author_notes(self, node, **kwargs):
    """
    Parse an <author-notes> node: the XML and the HTML of each <fn> child are appended to
    self.footnotes_xml and self.footnotes_html. Other children are silently ignored.
    """
    for child in node:
        if normalize(child.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
        self.footnotes_xml += get_xml_from_node(child)
        self.footnotes_html += fn_html

2681 

def parse_body(self, node, **kwargs):
    """
    Parse a <body> node.

    self.body gets the text of the node, then the TeX/HTML/XML versions of the body are
    updated with update_body_content.
    self.body_xml falls back to the XML of the node if it is still empty afterwards.
    """
    self.body = get_text_from_node(node)

    # self.floats exists only if a floats-group has been parsed:
    # start with an empty list of floats to insert
    if hasattr(self, "floats"):
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    self.body_xml = self.body_xml or get_xml_from_node(node)

2692 

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> and store its HTML in the self.floats dictionary,
    using the id attribute of the node as the key.
    A <boxed-text> without id is parsed, but its HTML is not stored.
    """
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html

2704 

def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    self.floats is reset to an empty dictionary, then the <fig>, <table-wrap> and
    <boxed-text> children are parsed. Any other child is reported in self.warnings.
    The XML of the node is stored in self.floats_group_xml.
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}
    for child in node:
        tag = normalize(child.tag)

        if tag == "fig":
            self.parse_node_with_fig(child, append_floats=True, base_url=base_url)
        elif tag == "table-wrap":
            self.parse_node_with_table_wrap(child, append_floats=True, base_url=base_url)
        elif tag == "boxed-text":
            self.parse_boxed_text(child, base_url=base_url)
        else:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2735 

def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node: the HTML and the XML of each <fn> child are appended to
    self.footnotes_html and self.footnotes_xml.
    Any other child is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "fn":
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True)
        fn_xml = get_xml_from_node(child)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml

2756 

2757 def parse_funding_group(self, node, **kwargs): 

# Parse a <funding-group> node: each child is dispatched on its normalized tag.
# - <award-group> children are delegated to self.parse_award_group.
# - <funding-statement>: the <fn> found inside its <name-content> children are parsed with
#   parse_node_with_fn; the HTML is appended to self.funding_statement_html and
#   self.funding_statement_xml receives the XML of the <name-content> node.
# - any other tag is reported in self.warnings.
# NOTE(review): the indentation is lost in this listing. Confirm in the source file whether
# funding_statement_xml is assigned for each <fn> or once per <name-content>.

2758 for child in node: 

2759 tag = normalize(child.tag) 

2760 

2761 if tag == "award-group": 2761 ↛ 2763line 2761 didn't jump to line 2763, because the condition on line 2761 was never false

2762 self.parse_award_group(child) 

2763 elif tag == "funding-statement": 

2764 for funding_node in child: 

2765 if funding_node.tag == "name-content": 

2766 for funding_child in funding_node: 

2767 if funding_child.tag == "fn": 

2768 _, html = self.parse_node_with_fn(funding_child, keep_fn=True) 

2769 self.funding_statement_html += html 

2770 self.funding_statement_xml = get_xml_from_node(funding_node) 

2771 

2772 # TODO: handle funding-statement with simple texts 

2773 else: 

2774 self.warnings.append( 

2775 { 

2776 self.pid: self.__class__.__name__ 

2777 + "." 

2778 + inspect.currentframe().f_code.co_name 

2779 + " " 

2780 + tag 

2781 } 

2782 ) 

2783 

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the seq attribute of the <issue> node ("0" if the attribute is missing).
    The attribute is ignored (self.seq is "0") when the object has a pii attribute.
    """
    # Elsevier stores bs in the seq attribute
    if hasattr(self, "pii"):
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2787 

2788 

class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference created from a <ref> node.

    The XML tree (kwargs["tree"]) is parsed in the constructor: the label, the data of the
    citation (contributors, titles, pages, ids...) and the TeX/HTML/XML versions of the
    citation are stored in instance variables.
    """

    def __init__(self, *args, **kwargs):  # , tree, lang):
        super().__init__(*args, **kwargs)  # lang)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the <ref> node.

        - <label> sets self.label (square brackets are added if the label does not start with "[")
        - <mixed-citation>, <note> and <element-citation> set the citation data
          and self.citation_tex / self.citation_html
        - any other child is reported in self.warnings
        The XML of every child is appended to self.citation_xml.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                # The label, if any, is put in front of the citation
                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Create the contributor described by a name node of a citation.

        :param node: <name>, <string-name>, <name-alternatives>, <collab> or <etal> node
        :param role: role of the contributor (ex: "author")
        :return: the dict created by create_contributor, updated with the data of the node
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A string-name without first/last name: use the text of the node
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.
        Inside a mixed-citation (is_mixed_citation=True), the HTML is post-processed with
        add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.
        Inside a mixed-citation (is_mixed_citation=True), the HTML is post-processed with
        add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the data of a citation node (<mixed-citation>, <note> or <element-citation>)
        and set the corresponding instance variables (type, titles, source, series,
        comment, annotation, contributors, ids...).

        Once a <comment> has been found, the value of a tag that would overwrite an already
        filled field is appended to self.comment instead.
        A child that is not handled explicitly and has a text sets the instance variable
        named after its tag ("-" replaced by "_"), unless it is already filled.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            # Same separator as for <source> and <series>:
                            # do not glue the title to an existing series
                            if self.series:
                                self.series += ", "
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # The text of an empty element is None: ignore the element instead of
                # raising a TypeError on the string concatenation
                if child.text:
                    if self.annotation:
                        self.annotation += ", "
                    # "ISBN: ..." or "ISSN: ..."
                    self.annotation += tag.upper() + ": " + child.text
            elif child.text is not None:
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    # The field is already filled: the data comes after a comment
                    # and is appended to the comment
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: its name children are appended to self.contributors.
        The role is the person-group-type attribute without its final "s", if any.
        Any other child is reported in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id>: the pub-id-type attribute and the text of the node are passed to
        add_extids_from_node_with_link as "rel" and "metadata".
        """
        node_type = node.get("pub-id-type") or ""

        data = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Set self.label_prefix and self.label_suffix with the parts of the lower-cased label
        (without its first and last characters) found before and after its digits.
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            # (the unpacking also fails if the label has more than one group of digits)
            self.label_prefix, self.label_suffix = [label, ""]

3056 

3057 

3058class BitsCollection(CollectionData, JatsBase): 

def __init__(self, *args, **kwargs):
    """
    Create the collection, then parse the XML tree given in kwargs["tree"].
    """
    super().__init__(*args, **kwargs)

    tree = kwargs["tree"]
    self.parse_tree(tree)

3062 

def parse_tree(self, tree):
    """
    Parse a <collection-meta> or an <in-collection> node.

    The <collection-meta> (the tree itself or a child of <in-collection>) is parsed with
    parse_collection_meta, then used to compute self.seq.
    A warning is added for the unknown children of <in-collection>,
    and if no <collection-meta> is found.
    self.collection is always set, with the pid of the object.
    """
    super().parse_tree(tree)

    if tree is not None:
        tag = normalize(tree.tag)
        collection_meta_node = None

        if tag == "collection-meta":
            self.parse_collection_meta(tree)
            collection_meta_node = tree
        elif tag == "in-collection":
            # Children of <in-collection> parsed by a dedicated function
            child_parsers = {
                "volume": self.parse_volume,
                "volume-series": self.parse_volume_series,
                "volume-title": self.parse_volume_title,
            }

            for node in tree:
                tag = normalize(node.tag)

                if tag == "collection-meta":
                    self.parse_collection_meta(node)
                    collection_meta_node = node
                elif tag in child_parsers:
                    child_parsers[tag](node)
                else:
                    location = (
                        self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                    )
                    self.warnings.append({self.pid: location + " " + tag})

        if collection_meta_node is None:
            # tag is the last tag read (the root or the last child of <in-collection>)
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + tag})
        else:
            self.set_seq(collection_meta_node)

    self.collection = Foo()
    self.collection.pid = self.pid

3111 

3112 def parse_collection_meta(self, node, **kwargs): 

3113 self.coltype = node.get("collection-type") 

3114 

3115 for child in node: 

3116 tag = normalize(child.tag) 

3117 

3118 if tag == "collection-id": 

3119 self.pid = child.text 

3120 elif tag == "title-group": 

3121 self.parse_title_group(child) 

3122 elif tag == "issn": 

3123 node_type = child.get("pub-type") 

3124 if node_type == "ppub": 3124 ↛ 3125line 3124 didn't jump to line 3125, because the condition on line 3124 was never true

3125 self.issn = child.text 

3126 self.ids.append(("issn", child.text)) 

3127 elif node_type == "epub": 3127 ↛ 3128line 3127 didn't jump to line 3128, because the condition on line 3127 was never true

3128 self.e_issn = child.text 

3129 self.ids.append(("e-issn", child.text)) 

3130 elif tag == "ext-link": 3130 ↛ 3131line 3130 didn't jump to line 3131, because the condition on line 3130 was never true

3131 data = self.get_data_from_ext_link(child) 

3132 self.ext_links.append(data) 

3133 elif tag == "volume-in-collection": 

3134 self.parse_volume_in_collection(child) 

3135 else: 

3136 self.warnings.append( 

3137 { 

3138 self.pid: self.__class__.__name__ 

3139 + "." 

3140 + inspect.currentframe().f_code.co_name 

3141 + " " 

3142 + tag 

3143 } 

3144 ) 

3145 

def parse_volume(self, node, **kwargs):
    """
    Set self.volume with the text of the node.
    """
    volume = node.text
    self.volume = volume

3148 

def parse_volume_in_collection(self, node, **kwargs):
    """
    Parse a <volume-in-collection> node: the <volume-number>, <volume-series> and
    <volume-title> children are parsed with parse_volume, parse_volume_series and
    parse_volume_title. Any other child is reported in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "volume-number":
            parse_child = self.parse_volume
        elif tag == "volume-series":
            parse_child = self.parse_volume_series
        elif tag == "volume-title":
            parse_child = self.parse_volume_title
        else:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + tag})
            continue

        parse_child(child)

3169 

def parse_volume_series(self, node, **kwargs):
    """
    Set self.vseries with the text of the node.
    """
    series = node.text
    self.vseries = series

3172 

def parse_volume_title(self, node, **kwargs):
    """
    Set the TeX, HTML and XML versions of the title from a <volume-title> node.
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node)
    self.title_tex = title_tex
    self.title_html = title_html
    self.title_xml = get_xml_from_node(node)

3176 

3177 def set_seq(self, node): 

# Compute self.seq (an integer) for the collection:
# 1. from the seq attribute of node, if it can be converted to an int;
# 2. otherwise from the part of self.volume before the first "-" (0 if there is no volume
#    or if this part is not an integer);
# 3. int(self.vseries) * 10000 is added as an offset when vseries is an integer.
# NOTE(review): the indentation is lost in this listing. Confirm in the source file whether
# step 3 is applied only in the fallback (inside the outer except) or in all cases.
# NOTE(review): int(self.vseries) raises a TypeError, which is not caught here, if vseries
# is None (parse_volume_series stores node.text) - confirm that vseries is always a string.

3178 try: 

3179 # First, use the seq attribute, if any 

3180 self.seq = int(node.get("seq") or "") 

3181 except ValueError: 

3182 # Second, use self.volume (which can be like "158-159") 

3183 if not self.volume: 3183 ↛ 3184line 3183 didn't jump to line 3184, because the condition on line 3183 was never true

3184 self.seq = 0 

3185 else: 

3186 text = self.volume.split("-")[0] 

3187 try: 

3188 self.seq = int(text) 

3189 except ValueError: 

3190 self.seq = 0 

3191 

3192 # Third, use self.vseries as an offset 

3193 try: 

# (English for the comment below: no more than 10000 books in a series)
3194 # pas plus de 10000 ouvrages dans une série (gasp) 

3195 self.seq = int(self.vseries) * 10000 + self.seq 

3196 except ValueError: 

3197 pass 

3198 

3199 

class BitsBook(BookData, JatsBase):
    """
    Book created from a BITS XML tree.

    The XML tree (kwargs["tree"]) is parsed in the constructor.
    no_bib (False by default) is passed to the book parts created while parsing the tree.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        tree = kwargs["tree"]
        self.parse_tree(tree)

    def parse_tree(self, tree):
        """
        Parse the root node of the book: collections, metadata, body, front matter and
        references. Unknown nodes are reported in self.warnings.
        The contributors and the title are then completed from the parts / collections.
        """
        super().parse_tree(tree)

        self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

        for node in tree:
            # Only the children of the same type as the root node are parsed
            if type(tree) != type(node):
                continue

            tag = normalize(node.tag)

            if tag in ("collection-meta", "in-collection"):
                self.incollection.append(BitsCollection(tree=node))
            elif tag == "book-meta":
                self.parse_book_meta(node)
            elif tag == "book-body":
                self.parse_book_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "book-back":
                # Only the bibliography of the <book-back> is parsed
                for child in node:
                    child_tag = normalize(child.tag)
                    if child_tag == "ref-list":
                        self.parse_ref_list(child)
                    else:
                        location = (
                            self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                        )
                        self.warnings.append({self.pid: location + " " + child_tag})
            else:
                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + tag})

        self.set_contribs()
        self.set_title()
        self.post_parse_tree()

    def parse_book_body(self, node, **kwargs):
        """
        Parse a <book-body>: a BitsBookPart is created for each <book-part> and its warnings
        are added to the warnings of the book.
        Without any part, self.body is set with the text of the node.
        """
        for child in node:
            # Only the children of the same type as the <book-body> node are parsed
            if type(child) != type(node):
                continue

            tag = normalize(child.tag)

            if tag != "book-part":
                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + tag})
                continue

            book_part = BitsBookPart(tree=child, no_bib=self.no_bib)
            self.warnings.extend(book_part.warnings)
            self.parts.append(book_part)

        if not self.parts:
            self.body = get_text_from_node(node)

    def parse_book_meta(self, node, **kwargs):
        """
        Parse a <book-meta>: ids, year, volume, dates, title and publisher.
        The other tags are dispatched to the parse_<tag> function of the object
        (called with add_ext_link=True), if any, or reported in self.warnings.
        The last modification date is the current date if the XML does not provide one.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-id":
                self.parse_id(child)
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child)
            elif tag == "book-volume-number":
                self.volume = child.text
                self.volume_int = child.text
            elif tag == "pub-history":
                for history_date in self.get_data_from_history(child):
                    date_type = history_date["type"]
                    if date_type == "last-modified":
                        self.last_modified_iso_8601_date_str = history_date["date"]
                    elif date_type == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = history_date["date"]
            elif tag == "book-title-group":
                self.parse_title_group(child)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=child)
            else:
                ftor = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                    continue

                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + tag})

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Parse a <custom-meta-group>: the <custom-meta> named "provider" sets self.provider.
        """
        for child in node:
            if normalize(child.tag) != "custom-meta":
                continue

            name, value = self.get_data_from_custom_meta(child)
            if name == "provider":
                self.provider = value

    def set_contribs(self):
        """
        Update the contributors if the XML does not declare any author
        - with the contributors of the first part if the book is a monograph
        - with the contributors of the parts if all the parts of an edited book or of
          lecture notes have the same contributors (an edited book becomes a monograph)
        - with a "Collectif" author otherwise

        Nothing is done if the book has no part.

        :return:
        """

        authors = [contrib for contrib in self.contributors if contrib["role"] == "author"]
        if authors or not self.parts:
            return

        if self.ctype == "book-monograph":
            self.contributors = self.parts[0].contributors
        elif self.ctype in ("book-edited-book", "book-lecture-notes"):
            # check if authors of the book-parts are identical
            book_part_contributors = self.parts[0].contributors
            same_contributors = not any(
                part.contributors != book_part_contributors for part in self.parts[1:]
            )

            if same_contributors:
                if self.ctype == "book-edited-book":
                    self.ctype = "book-monograph"
                self.contributors = book_part_contributors
            else:
                contrib = create_contributor()
                contrib["string_name"] = "Collectif"
                contrib["role"] = "author"
                contrib["contrib_xml"] = get_contrib_xml(contrib)
                self.contributors.append(contrib)

    def set_title(self):
        """
        Use the title of the first collection if the book has no title.
        """
        if self.title_xml != "" or len(self.incollection) == 0:
            return

        first_collection = self.incollection[0]
        self.title_xml = first_collection.title_xml
        self.title_html = first_collection.title_html
        self.title_tex = first_collection.title_tex

3373 

3374 

3375class BitsBookPart(BookPartData, JatsArticleBase): 

def __init__(self, *args, **kwargs):
    """
    Create the book part, store the no_bib option (False by default),
    then parse the XML tree given in kwargs["tree"].
    """
    super().__init__(*args, **kwargs)
    self.no_bib = kwargs.get("no_bib", False)

    tree = kwargs["tree"]
    self.parse_tree(tree)

3381 

3382 def parse_tree(self, tree): 

3383 super().parse_tree(tree) 

3384 

3385 self.atype = get_normalized_attrib(tree, "book-part-type") or "" 

3386 try: 

3387 self.seq = int(get_normalized_attrib(tree, "seq") or "") 

3388 except ValueError: 

3389 pass 

3390 

3391 for node in tree: 

3392 tag = normalize(node.tag) 

3393 

3394 if tag == "book-part-meta": 

3395 self.parse_book_part_meta(node) 

3396 elif tag == "body": 

3397 self.parse_body(node) 

3398 elif tag == "front-matter": 3398 ↛ 3399line 3398 didn't jump to line 3399, because the condition on line 3398 was never true

3399 self.parse_front_matter(node) 

3400 elif tag == "back": 3400 ↛ 3417line 3400 didn't jump to line 3417, because the condition on line 3400 was never false

3401 for child in node: 

3402 tag =