Coverage for apps/ptf/cmds/xml/jats/jats_parser.py: 70%

2037 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-02-28 09:09 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# jats_parser.py is a replacement of xmldata.py 

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom. 

7# Each node is read only once. 

8# 

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds. 

10# The xml tree is parsed in the class constructor (__init__) 

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables. 

12# Some parse_<tag> functions are called directly. 

13# Ex: if tag == "article-meta": 

14# self.parse_article_meta(child) 

15# Other parse_<tag> functions are called "automatically" 

16# fct_name = 'parse_' + tag.replace('-', '_') 

17# ftor = getattr(self, fct_name, None) 

18# if callable(ftor): 

19# ftor(child) 

20# 

21# JatsBase and JatsArticleBase are base classes. 

22# They provide common instance variables and their corresponding parse_<tag> functions 

23# 

24# html_from_<tag> are used to generate the HTML text of a node with mixed content: 

25# a node that mixes text, children and tail 

26# These functions can also extract data and set instance variables (ex: self.figures) 

27# 

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects 

29# 

30# At the end of this file, there are some functions that are/were called by ptf-tools. 

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser 

32# 

33# TODO: the import OAI or the import of a collection could simply call the first function 

34# (def parser(tree)) 

35# 

36################################################################################################## 

37 

38import copy 

39import inspect 

40import os 

41import re 

42 

43from lxml import etree 

44from pylatexenc.latexencode import unicode_to_latex 

45 

46from django.conf import settings 

47from django.urls import reverse 

48from django.utils import timezone 

49 

50from matching import scrapping 

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title 

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors 

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title 

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source 

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume 

56from ptf.cmds.xml.citation_html import get_citation_html 

57from ptf.cmds.xml.xml_base import RefBase 

58from ptf.cmds.xml.xml_base import XmlParserBase 

59from ptf.cmds.xml.xml_utils import escape 

60from ptf.cmds.xml.xml_utils import get_contrib_xml 

61from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions 

62from ptf.cmds.xml.xml_utils import get_normalized_attrib 

63from ptf.cmds.xml.xml_utils import get_text_from_node 

64from ptf.cmds.xml.xml_utils import get_xml_from_node 

65from ptf.cmds.xml.xml_utils import helper_update_name_params 

66from ptf.cmds.xml.xml_utils import make_links_clickable 

67from ptf.cmds.xml.xml_utils import normalize 

68from ptf.cmds.xml.xml_utils import normalize_space 

69from ptf.cmds.xml.xml_utils import split_kwds 

70from ptf.display import resolver 

71from ptf.model_data import ArticleData 

72from ptf.model_data import BookData 

73from ptf.model_data import BookPartData 

74from ptf.model_data import CollectionData 

75from ptf.model_data import Foo 

76from ptf.model_data import IssueData 

77from ptf.model_data import JournalData 

78from ptf.model_data import MathdocPublicationData 

79from ptf.model_data import PublisherData 

80from ptf.model_data import create_contributor 

81 

82 

83class JatsBase(XmlParserBase): 

def __init__(self, *args, **kwargs):
    """Initialise the state shared by the JATS parsers.

    Positional and keyword arguments are accepted but are not forwarded to
    the parent constructor.
    """
    super().__init__()

    # Accumulators filled while the XML nodes are parsed.
    self.warnings = []
    self.fns = []

    # No tree is attached yet; parse_tree() stores it.
    self.tree = None

    # Used to convert an XML value for CKEditor (e.g. an abstract):
    # the TeX of a formula is then wrapped in a <span>.
    self.add_span_around_tex_formula = False

    # Used to create a TeX file from an XML value (e.g. an abstract).
    self.for_tex_file = False

93 

def parse_tree(self, tree):
    """Store the XML tree and its language on the instance.

    :param tree: root node of the XML document
    :return: None. ``self.tree`` and ``self.lang`` are set; ``self.lang``
        falls back to "und" when no "lang" attribute value is found.
    """
    self.tree = tree
    lang = get_normalized_attrib(tree, "lang")
    self.lang = lang if lang else "und"

97 

def parse_node_with_article_title(self, node, **kwargs):
    """Parse an article-title node.

    :param node: XML node of the article title
    :param kwargs: parsing options; ``is_mixed_citation`` (default False)
        triggers the citation-specific decoration of the HTML
    :return: a (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    # Only titles found inside a mixed-citation get the extra span class.
    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

106 

def parse_node_with_break(self, node, **kwargs):
    """Render a line break.

    :param node: XML node of the break (not inspected)
    :return: a (tex, html) tuple: a TeX ``\\newline`` when a TeX file is
        being produced, a single space otherwise; the HTML is always <br/>
    """
    if self.for_tex_file:
        tex = "\\newline\n"
    else:
        tex = " "

    return tex, "<br/>"

112 

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """Render a chem-struct-wrap node as a two-cell "formula" table.

    The <label> child goes into the "formula-label" cell; the HTML of every
    other child is concatenated into the "formula-inner" cell.

    :param node: XML node of the chem-struct-wrap
    :return: the same HTML text twice (used for both tex and html)
    """
    table_id = node.attrib.get("id")
    label = None
    inner_parts = []

    for child in node:
        is_label = normalize(child.tag) == "label"
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if is_label:
            label = child_html
        else:
            inner_parts.append(child_html)

    inner_text = "".join(inner_parts)
    id_attr = f'id="{table_id}" ' if table_id else ""
    label_text = label if label else ""

    text = (
        f'<table {id_attr}class="formula"><tr><td class="formula-inner">{inner_text}</td>'
        f'<td class="formula-label">{label_text}</td></tr></table>'
    )

    return text, text

140 

def parse_node_with_disp_quote(self, node, **kwargs):
    """Wrap the content of a disp-quote node in a <div class="disp-quote">.

    :param node: XML node of the quote
    :return: a (tex, html) tuple; both values get the same <div> wrapper
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    wrapper = '<div class="disp-quote">{}</div>'

    return wrapper.format(inner_tex), wrapper.format(inner_html)

148 

def parse_node_with_boxed_text(self, node, **kwargs):
    """Wrap the content of a boxed-text node in a <div class="boxed-text">.

    :param node: XML node of the boxed text; its "id" attribute, when
        present and non-empty, is copied onto the <div>
    :return: ("", html): no TeX is produced for a boxed text
    """
    box_id = node.attrib.get("id")

    _, node_html = self.parse_inner_node(node, **kwargs)

    id_attr = f'id="{box_id}" ' if box_id else ""

    return "", f'<div {id_attr}class="boxed-text">{node_html}</div>'

162 

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    <attrib> and <permissions>/<copyright-statement> children are appended to
    the caption as <p class="fig-small-caption"> paragraphs.
    Unknown children are reported in self.warnings.

    :param node: XML node of a fig
    :param kwargs: parsing options forwarded to the child parsers;
        if ``append_floats`` is truthy, the instance has a ``floats`` dict and
        the fig has an id, the HTML is also stored in ``self.floats[fig_id]``
    :return: ("", html): no TeX is produced for a figure
    """
    fig_id = node.attrib.get("id")
    label_html = title_html = None
    # Start from "" rather than None: <attrib>/<permissions> may come without
    # any caption paragraph, and formatting None into the caption would put
    # the literal text "None" in the page.
    caption_html = ""
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                caption_tag = normalize(caption_child.tag)
                if caption_tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif caption_tag == "p":
                    _, caption_p_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # Several paragraphs: flag the first one and render
                        # the following ones as small captions.
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += caption_p_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = caption_p_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + caption_tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            caption_html = f'{caption_html}<p class="fig-small-caption">{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                # NOTE(review): the raw tag is compared here (no normalize());
                # confirm that <copyright-statement> is never namespaced.
                if gchild.tag == "copyright-statement":
                    _, copyright_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html = (
                        f'{caption_html}<p class="fig-small-caption">{copyright_html}</p>'
                    )
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"

        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html

        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

267 

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    Unknown children are reported in self.warnings.

    :param node: XML node of a fn
    :param kwargs: parsing options.
        ``keep_fn`` (default False): when truthy the footnote HTML is
        returned in place; otherwise it is stripped from the returned HTML and
        collected (once) in ``self.fns``.
        ``keep_fn_label`` (default True): when falsy the <sup> label is
        omitted.
    :return: ("", html) where html is "" unless ``keep_fn`` is truthy
    """
    fn_id = node.attrib.get("id")
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            # The footnote is rendered as a single <p>: drop the inner ones.
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"

    if label_html and kwargs.get("keep_fn_label", True):
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    # Read keep_fn defensively (like keep_fn_label) so that a caller that
    # does not pass it gets the "collect the footnote" behaviour instead of
    # a KeyError.
    if kwargs.get("keep_fn", False):
        return "", html

    if html not in self.fns:
        self.fns.append(html)

    return "", ""

314 

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and
    self.related_objects (unless an identical dict is already in self.figures).

    :param node: XML node of a graphic
    :param kwargs: parsing options; ``base_url`` is required when the node
        has an href
    :return: ("", html); html is "" when the node has no href
    """
    href = ""
    for attrib in node.attrib:
        # The attribute name may be namespaced (xlink:href): compare its
        # normalized form.
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display. (The HTML used to be left unbound on this
        # path, which raised UnboundLocalError.)
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename

    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    # The lightbox index is the number of figures known before this one is
    # registered.
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

383 

def parse_node_with_inline_formula(self, node, **kwargs):
    """Parse an inline-formula or disp-formula node.

    MathJax is doing a good job with formulae and is now the standard.
    MathML could be ignored in HTML (the original XML value is preserved with
    value_xml) and we could simply return the tex-math text, but there are
    multiple errors in the TeX of the Mersenne articles. Those mistakes need
    to be fixed before switching to TeX.

    :param node: XML node of the formula
    :return: a (tex, html) tuple. The html is a "mathjax-formula" span
        holding the MathML, with the TeX in its data-tex attribute; it is
        wrapped in a "formula" table for a disp-formula or a labelled formula
    """
    is_display = node.tag == "disp-formula"
    formula_id = node.attrib.get("id")
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "alternatives":
            for alternative in child:
                alternative_tag = normalize(alternative.tag)
                if alternative_tag == "tex-math":
                    tex_math = alternative.text or ""
                elif alternative_tag == "math":
                    # Elsevier sometimes provides the formula as an
                    # alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative).replace("mml:", "")
                    for namespace_declaration in (
                        'xmlns:xlink="http://www.w3.org/1999/xlink"',
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                    ):
                        math_text = math_text.replace(namespace_declaration, "")
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        elif tag == "label":
            label = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Exactly one of the two alternatives is missing: report which
    # high-level parse_<tag> function led here.
    if (math_text == "") != (tex_math == ""):
        excluded = ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
        callers = []
        for frameinfo in inspect.stack()[1:]:
            function_name = frameinfo[3]
            if function_name.startswith("parse_") and not any(
                fragment in function_name for fragment in excluded
            ):
                callers.append(function_name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")

    # An inline formula must be delimited by "$" in TeX.
    if not is_display and tex_math != "":
        if not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    in_table = bool(label) or is_display
    alt_text = tex_math.replace("\n", "") if is_display else tex_math

    pieces = []
    if in_table:
        pieces.append('<table class="formula"><tr><td class="formula-inner">')
    pieces.append('<span class="mathjax-formula" ')
    if formula_id:
        pieces.append('id="' + formula_id + '" ')
    pieces.append(f'data-tex="{alt_text}">{math_text}</span>')
    if in_table:
        pieces.append('</td><td class="formula-label">')
        if label:
            pieces.append(label)
        pieces.append("</td></tr></table>")
    html = "".join(pieces)

    tex = tex_math
    if self.add_span_around_tex_formula:
        # The first and last characters (the "$" delimiters of an inline
        # formula) are replaced by \( and \).
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html

477 

def parse_node_with_institution_id(self, node, **kwargs):
    """Ignore an institution-id node: it produces neither TeX nor HTML.

    :return: ("", "")
    """
    nothing = ""
    return nothing, nothing

480 

def parse_node_with_italic(self, node, **kwargs):
    """Render an italic node.

    :param node: XML node of the italic text
    :return: a (tex, html) tuple. The html is a <span class="italique">;
        the tex is ``{\\it ...}`` for a TeX file and <i>...</i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "{\\it " + inner_tex + "}"
    else:
        tex = f"<i>{inner_tex}</i>"

    return tex, f'<span class="italique">{inner_html}</span>'

501 

def parse_node_with_list(self, node, **kwargs):
    """Render a list node.

    The "list-type" attribute selects the HTML wrapper:
    - "bullet" / "simple": <ul>
    - "order" / "number": <ol type="1"> (with a start value when the list
      has a "continued-from" attribute)
    - "alpha-lower", "alpha-upper", "roman-lower", "roman-upper": <ol> with
      the matching type
    - anything else: a <ul> without bullets

    :param node: XML node of the list
    :return: a (tex, html) tuple. The html is always wrapped; the tex gets
        the same wrapper, or an itemize/enumerate environment when a TeX file
        is being produced
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    ordered_types = {
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    list_type = node.get("list-type")
    close_tag = "</ol>"
    if list_type == "bullet" or list_type == "simple":
        environment = "itemize"
        open_tag, close_tag = "<ul>", "</ul>"
    else:
        environment = "enumerate"
        if list_type == "order" or list_type == "number":
            if start is not None:
                open_tag = f'<ol type="1" start="{str(start)}">'
            else:
                open_tag = '<ol type="1">'
        elif list_type in ordered_types:
            open_tag = f'<ol type="{ordered_types[list_type]}">'
        else:
            open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
            close_tag = "</ul>"

    # The HTML is wrapped whatever the TeX mode is. (Non-bullet lists used to
    # lose their <ol>/<ul> wrapper when for_tex_file was set.)
    html = f"{open_tag}{html}{close_tag}"

    if self.for_tex_file:
        tex = "\n\\begin{" + environment + "}\n" + tex + "\\end{" + environment + "}\n"
    else:
        tex = f"{open_tag}{tex}{close_tag}"

    return tex, html

546 

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined right after the label/title. Every following
    <p> (wrapped in <p>...</p> in the HTML) and every nested <list> is
    appended after it, in document order.
    Unknown children are reported in self.warnings.

    :param node: XML node of the list-item
    :return: a (tex, html) tuple; the tex is an ``\\item`` line when a TeX
        file is being produced, a <li> otherwise
    """
    title_tex = title_html = label_tex = label_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Accumulate instead of overwriting, so that an item with
                # several paragraphs/lists keeps all of them.
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

607 

def parse_node_with_name_content(self, node, **kwargs):
    """Parse a name-content node: its inner content is returned unchanged.

    :return: a (tex, html) tuple
    """
    parsed = self.parse_inner_node(node, **kwargs)
    tex, html = parsed
    return tex, html

611 

def parse_node_with_p(self, node, **kwargs):
    """Render a paragraph node.

    :param node: XML node of the paragraph; its "specific-use" attribute,
        when set, becomes the class of the HTML <p>
    :return: a (tex, html) tuple; the tex is wrapped in <p> unless a TeX
        file is being produced
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = f"<p>{tex}</p>"

    specific_use = node.get("specific-use")
    open_tag = f'<p class="{specific_use}">' if specific_use else "<p>"
    html = f"{open_tag}{inner_html}</p>"

    # Floats waiting to be inserted are appended after the paragraph:
    # the queue is emptied, and each float found is removed from self.floats.
    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        pending = self.floats_to_insert
        while len(pending) > 0:
            float_id = pending.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html

632 

def parse_node_with_sc(self, node, **kwargs):
    """Render a small-caps node.

    :return: a (tex, html) tuple; only the html is decorated, with a
        <span class="smallcaps">
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, f'<span class="smallcaps">{inner_html}</span>'

638 

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of the section
    :param kwargs: parsing options; ``sec_level`` is the heading level of
        this section (2 when absent)
    :return: a (tex, html) tuple
    """
    sec_level = kwargs.get("sec_level", 2)
    # Nested sections get the next heading level.
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    inner_tex_parts = []
    inner_html_parts = []

    for child in node:
        tag = normalize(child.tag)
        # NOTE(review): label and title are parsed without the kwargs,
        # unlike the other children - confirm this is intended.
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            inner_tex_parts.append(child_tex)
            inner_html_parts.append(child_html)

    tex = ""
    html = "<section>"

    if label_html or title_html:
        heading_tex = []
        heading_html = []
        if label_html:
            heading_tex.append(label_tex)
            heading_html.append(label_html)
        if title_html:
            heading_tex.append(title_tex)
            heading_html.append(title_html)

        # Label and title are separated by a space when both are present.
        tex += " ".join(heading_tex)
        html += f"<h{str(sec_level)}>" + " ".join(heading_html) + f"</h{str(sec_level)}>"

    tex += "".join(inner_tex_parts)
    html += "".join(inner_html_parts) + "</section>"

    return tex, html

686 

def parse_node_with_string_name(self, node, **kwargs):
    """Parse a string-name node.

    :param kwargs: parsing options; with ``is_mixed_citation`` (default
        False) the HTML is title-cased and decorated as an author name
    :return: a (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html

695 

def parse_node_with_strong(self, node, **kwargs):
    """Render a bold node.

    :return: a (tex, html) tuple. The html is a <strong>; the tex is
        ``{\\bf ...}`` for a TeX file and <strong>...</strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"

    return tex, f"<strong>{inner_html}</strong>"

706 

def parse_node_with_styled_content(self, node, **kwargs):
    """Render a styled-content node.

    :param node: XML node; a non-empty "style" attribute is copied onto a
        <span> wrapping the HTML
    :return: a (tex, html) tuple; the tex is never decorated
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style != "":
        html = f'<span style="{style}">{html}</span>'

    return tex, html

716 

def parse_node_with_sub(self, node, **kwargs):
    """Render a subscript node.

    :return: a (tex, html) tuple. The html is a <sub>; the tex is
        ``\\textsubscript{...}`` for a TeX file and <sub>...</sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else f"<sub>{inner_tex}</sub>"

    return tex, f"<sub>{inner_html}</sub>"

727 

def parse_node_with_sup(self, node, **kwargs):
    """Render a superscript node.

    :return: a (tex, html) tuple. The html is a <sup>; the tex is
        ``\\textsuperscript{...}`` for a TeX file and <sup>...</sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = (
        "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else f"<sup>{inner_tex}</sup>"
    )

    return tex, f"<sup>{inner_html}</sup>"

738 

def parse_node_with_table_generic(self, node, **kwargs):
    """Render a table element (table, row, cell...) as the matching HTML tag.

    "row" and "entry" are renamed "tr" and "td"; any other tag name is
    kept. The rowspan, colspan, align and style attributes are copied; valign
    becomes a "td-valign-<value>" class. A <table> gets the "table" class,
    plus "nowrap-col-<i>" for each colgroup/col (1-based) with a width.

    :param node: XML node of the table element
    :return: ("", html): no TeX is produced for a table
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    attributes = node.attrib
    valign_class = None
    if "valign" in attributes:
        valign_class = "td-valign-" + attributes["valign"]

    pieces = ["<" + tag]

    if tag == "table":
        classes = ["table"]
        for index, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                classes.append(f"nowrap-col-{index}")
        if valign_class is not None:
            # A tag can only carry one class attribute: merge the valign
            # class here instead of emitting a second (ignored) attribute.
            classes.append(valign_class)
            valign_class = None
        class_value = " ".join(classes)
        pieces.append(f' class="{class_value}"')

    for name in ("rowspan", "colspan", "align"):
        if name in attributes:
            pieces.append(" " + name + '="' + attributes[name] + '"')
    if valign_class is not None:
        pieces.append(' class="' + valign_class + '"')
    if "style" in attributes:
        pieces.append(' style="' + attributes["style"] + '"')
    pieces.append(">")

    open_tag = "".join(pieces)

    return "", f"{open_tag}{inner_html}</{tag}>"

775 

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.

    The label (in <strong>) and the caption go into a
    <div class="table-wrap-header">; the other children follow.

    :param node: XML node of the table-wrap
    :param kwargs: parsing options forwarded to the child parsers;
        if ``append_floats`` is truthy, the instance has a ``floats`` dict and
        the table has an id, the HTML is also stored in ``self.floats[table_id]``
    :return: ("", html): no TeX is produced for a table
    """
    table_id = node.attrib.get("id")
    label = caption = None
    body_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body_parts.append(child_html)

    if table_id:
        text = '<div class="table-wrap table-responsive" id="' + table_id + '">'
    else:
        text = '<div class="table-wrap table-responsive">'

    # Header: "<strong>LABEL</strong> CAPTION", each part being optional.
    header_parts = []
    if label:
        header_parts.append("<strong>" + label + "</strong>")
    if caption:
        header_parts.append(caption)
    if header_parts:
        text += '<div class="table-wrap-header">' + " ".join(header_parts) + "</div>"

    text += "".join(body_parts) + "</div>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text

831 

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Render the footer of a table as a <div class="table-wrap-foot">.

    Only the <fn-group> children are rendered; they are parsed with keep_fn=True
    so that the footnotes stay inside this div.

    :param node: the table-wrap-foot XML node
    :param kwargs: options forwarded to parse_node_with_mixed_content
    :return: ("", html); no TeX is produced for this node
    """
    kwargs["keep_fn"] = True

    fn_groups = []
    for child in node:
        if normalize(child.tag) != "fn-group":
            continue
        _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
        fn_groups.append(group_html)

    return "", '<div class="table-wrap-foot">' + "".join(fn_groups) + "</div>"

852 

def parse_node_with_toc(self, node, **kwargs):
    """
    Render a toc node as an HTML <table> wrapping the HTML of its content.

    :param node: the toc XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: ("", html); the TeX computed for the content is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)
    return "", f"<table>{rows_html}</table>"

862 

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Render a toc-entry node as one HTML table row, followed by the rows
    of the toc-entry nodes embedded in it.

    The row shows "<label> <title>" and "p. <page>". When a nav-pointer with
    specific-use="pageindex" is found inside a nav-pointer-group, both cells link
    to the PDF of the item (self.pid) at page index + 1.

    :param node: the toc-entry XML node
    :param kwargs: parsing options; inside_toc_entry marks an embedded entry
    :return: ("", html); no TeX is produced for this node
    """
    label = title = page = anchor = ""
    nested_rows = ""
    toc_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer-group":
            for pointer in child:
                if pointer.tag != "nav-pointer":
                    continue
                use = pointer.attrib.get("specific-use")
                if use == "pagenum":
                    _, page = self.parse_node_with_mixed_content(pointer, **kwargs)
                elif use == "pageindex":
                    anchor = int(pointer.text) + 1
        elif tag == "toc-entry":
            # Embedded entries are parsed with a fresh set of options
            _, nested_html = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            nested_rows += nested_html

    toc_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        toc_text = f'<a href="{href}">{toc_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{toc_class}">{toc_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows

916 

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an underline node in <u>...</u>, in both outputs.

    :param node: the underline XML node
    :param kwargs: options forwarded to parse_inner_node
    :return: (tex, html), each wrapped in a <u> element
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)
    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"

923 

def parse_node_with_volume(self, node, **kwargs):
    """
    Parse a volume node. Inside a mixed-citation (is_mixed_citation kwarg),
    the HTML is post-processed by add_span_class_to_html_from_volume.

    :param node: the volume XML node
    :param kwargs: parsing options, forwarded to the helpers
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

932 

def parse_node_with_xref(self, node, **kwargs):
    """
    Render an xref node as an HTML link to its first target.

    The rid attribute may list several space-separated ids. The link points to
    the first one; a leading "bib" is replaced by "r" (ex: bib12 => #r12).
    If ref-type is fig, table or textbox and self has a floats_to_insert list,
    every id of rid is appended to that list.

    An xref without any id in rid (attribute missing, empty or made of
    whitespace only) produces no output, as does any xref when the ignore_xref
    kwarg is truthy.

    :param node: the xref XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # split() returns [] for a whitespace-only attribute: indexing rids[0]
    # without this guard would raise an IndexError
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # The ref-type does not depend on the id: read it once
    ref_type = node.get("ref-type") or None
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

955 

def parse_inner_node(self, node, **kwargs):
    """
    Return the content of a node (its text and its children, but neither its own
    tag nor its tail). Used by the parse_node_with_<tag> functions for nodes that
    have a different tag in HTML.

    The children are parsed with is_top=False, so their tails are included.

    :param node: XML node
    :param kwargs: parsing options, forwarded to parse_node_with_mixed_content
    :return: (tex, html)
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex = html = ""
    leading_text = node.text
    if leading_text:
        tex = unicode_to_latex(leading_text) if self.for_tex_file else leading_text
        html = escape(leading_text)

    for child in node:
        parsed = self.parse_node_with_mixed_content(child, **kwargs)
        tex += parsed[0]
        html += parsed[1]

    return tex, html

980 

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse an XML node which mixes text and XML sub-nodes, and return its TeX and HTML texts.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>
    Some inner nodes are removed, others are kept or replaced by their HTML equivalent.
    parse_node_with_mixed_content is called recursively to get the text of the children.

    If a parse_node_with_<tag> method exists for the node's tag (after applying the
    tag_mapped aliases), the node is delegated to that method.

    :param node: XML Node
    :param kwargs: params of the function; the missing ones get a default value below
    :return: (tex, html); ("", "") if node is None
    """

    if node is None:
        return "", ""

    # The tail is the text following the end of the node
    # Ex: <node>text1<a>text_a</a>a_tail</node>
    # The returned text has to include the tail
    # only if parse_node_with_mixed_content was called recursively
    kwargs["is_top"] = kwargs["is_top"] if "is_top" in kwargs else True

    # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
    kwargs["sec_level"] = kwargs["sec_level"] if "sec_level" in kwargs else 2

    # Text in <comment> is parsed to add HTML links.
    kwargs["add_HTML_link"] = kwargs["add_HTML_link"] if "add_HTML_link" in kwargs else False

    # base_url to image links
    kwargs["base_url"] = kwargs["base_url"] if "base_url" in kwargs else ""

    # footnotes are removed from the fulltext (and put at the end) except for those in a table
    kwargs["keep_fn"] = kwargs["keep_fn"] if "keep_fn" in kwargs else False

    kwargs["is_citation"] = kwargs["is_citation"] if "is_citation" in kwargs else False
    kwargs["is_comment"] = kwargs["is_comment"] if "is_comment" in kwargs else False
    # mixed-citation ignores ext-link
    kwargs["add_ext_link"] = kwargs["add_ext_link"] if "add_ext_link" in kwargs else False

    # TODO remove once jats_parser has been validated against xmldata
    kwargs["temp_math"] = kwargs["temp_math"] if "temp_math" in kwargs else False
    kwargs["temp_tex"] = kwargs["temp_tex"] if "temp_tex" in kwargs else False
    kwargs["is_mixed_citation"] = (
        kwargs["is_mixed_citation"] if "is_mixed_citation" in kwargs else False
    )
    kwargs["is_body_html"] = kwargs["is_body_html"] if "is_body_html" in kwargs else False

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    # (they are kept inside a <comment>)
    if not (kwargs["is_comment"]) and tag in ("pub-id", "object-id"):
        return "", ""

    # These flags are passed down to the children through kwargs
    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags have a corresponding parse_node_with_@tag function to generate the HTML text.

    # Check if the parse_node_with_@tag exists
    # tag_mapped lists the tags handled by the parse_node_with_ function of another tag
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "tgroup": "table-generic",
        "entry": "table-generic",
        "row": "table-generic",
    }

    fct_name = tag_mapped[tag] if tag in tag_mapped else tag
    fct_name = "parse_node_with_" + fct_name.replace("-", "_")
    ftor = getattr(self, fct_name, None)
    if callable(ftor):
        inner_tex, inner_html = ftor(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and not kwargs["is_comment"] and kwargs["add_ext_link"]:
            is_extid_value = self.parse_ext_link(node, **kwargs)
            if is_extid_value and kwargs["is_mixed_citation"]:
                # an extid has been found in a mixed_citation, no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        # The material is recorded by parse_supplementary_material;
        # no text is added for this node (only its tail, below)
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Add the node text (before the children text)
        if node.text is not None:
            node_text = node.text
            if self.for_tex_file:
                node_text = unicode_to_latex(node_text)
            inner_tex += node_text
            inner_html += escape(node.text)

        # II.2. children
        # Each child is parsed recursively, with is_top=False so that its tail is included

        child_kwargs = kwargs.copy()
        child_kwargs["is_top"] = False

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            # (the same characters are dropped from the TeX text)
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in [",", "."]
                and inner_html[-2:] == ". "
            ):
                inner_html = inner_html[0:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[0:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. wrap the children text with html links
        # (skipped when the node text starts with a newline or a space)
        if kwargs["add_HTML_link"] and node.text:
            match = re.match(r"[\n ]+", node.text)
            if not match:
                inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"]:
        node_tail = node.tail
        if self.for_tex_file:
            node_tail = unicode_to_latex(node_tail)
        tex += node_tail
        html += escape(node.tail)

    return tex, html

1132 

def parse_abstract(self, node, **kwargs):
    """
    Parse an abstract node and append it to self.abstracts.

    The entry tag is the abstract-type attribute; it is "abstract" when the
    attribute is missing or is "author". The lang defaults to self.lang.

    :param node: the abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type")
    if not abstract_type or abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    value_tex, value_html = self.parse_node_with_mixed_content(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": get_xml_from_node(node),
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(entry)

1150 

def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an aff-alternatives node and add its address to the contributors.

    The address is the text of an <aff> child: the text that follows its <label>
    if it has one, otherwise its leading text.
    The address is added to the contributors that reference the id of the node in
    their xrefs, or to all the contributors if no <label> was found.
    A warning is recorded for any child that is not an <aff>.

    :param node: the aff-alternatives XML node
    :param kwargs: unused
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    aff_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for part in child:
            if part.tag == "label" and address == "":
                label = get_text_from_node(part)
                address = get_text_from_node(child)[len(label) :]
                aff_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        if ("xrefs" in contrib and xref_id in contrib["xrefs"]) or aff_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1187 

def parse_award_group(self, node, **kwargs):
    """
    Parse an award-group node. An award is appended to self.awards only if
    both an <award-id> and a <funding-source> were found.
    A warning is recorded for any other child.

    :param node: the award-group XML node
    :param kwargs: unused
    """
    found = {"abbrev": None, "award_id": None}

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            found["award_id"] = child.text
        elif tag == "funding-source":
            found["abbrev"] = get_text_from_node(child)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if found["abbrev"] is None or found["award_id"] is None:
        return

    self.awards.append(found)

1211 

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a contrib-group node.

    Each <contrib> becomes an entry of self.contributors. Its role is the
    content-type of the group without its final "s" (ex: authors => author),
    followed by "|" and the contrib's own role if it has one.
    <aff-alternatives> are handled by parse_aff_alternatives; <fn> are added to
    the footnotes. A warning is recorded for any other child.

    :param node: the contrib-group XML node
    :param kwargs: unused
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            self.footnotes_xml += get_xml_from_node(child)
            self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

1242 

def parse_counts(self, node, **kwargs):
    """
    Parse a counts node and append a (tag, value) tuple to self.counts for each child.

    The value is the count attribute of the child or, without this attribute,
    its text. Children without a value are skipped.
    The book-page-count tag is stored as page-count.

    :param node: the counts XML node
    :param kwargs: unused
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        name = normalize(child.tag)
        self.counts.append(("page-count" if name == "book-page-count" else name, value))

1255 

def parse_ext_link(self, node, **kwargs):
    """
    Parse an ext-link node.

    The data of the link are first given to add_extids_from_node_with_link.
    If no extid was found and the add_ext_link kwarg is truthy, the data are
    appended to self.ext_links, unless they are already there or the link is a cover.

    :param node: the ext-link XML node
    :param kwargs: parsing options; only add_ext_link is read
    :return: True if the link was an extid, False otherwise
    """
    datas = self.get_data_from_ext_link(node)
    extid_value = self.add_extids_from_node_with_link(datas)
    is_extid = extid_value[0] is not None

    if kwargs.get("add_ext_link", False) and not is_extid:
        if datas not in self.ext_links and datas["rel"] != "cover":
            self.ext_links.append(datas)

    return is_extid

1270 

def parse_front_matter(self, node, **kwargs):
    """
    Parse a front-matter node: store its XML in self.frontmatter_xml and the
    HTML of its <foreword> and <toc> children in self.frontmatter_foreword_html
    and self.frontmatter_toc_html.

    self.frontmatter_foreword_html is reset to "" first; self.frontmatter_toc_html
    is only assigned when a <toc> is found.

    :param node: the front-matter XML node
    :param kwargs: unused
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    # tag => name of the attribute receiving the HTML of the child
    targets = {"foreword": "frontmatter_foreword_html", "toc": "frontmatter_toc_html"}

    for child in node:
        attr_name = targets.get(normalize(child.tag))
        if attr_name is None:
            continue
        _, child_html = self.parse_node_with_mixed_content(child)
        setattr(self, attr_name, child_html)

1282 

def parse_id(self, node, **kwargs):
    """
    Parse an id node (attribute pub-id-type, book-id-type or book-part-id-type)
    and store its text according to its type:
      - pii: self.pii, only if self.pid is longer than 2 characters and starts with "CR"
      - numdam-id, mathdoc-id: self.pid
      - ark: appended to self.extids
      - doi, eid: appended to self.ids; a doi is also stored in self.doi
    Other types are ignored.

    :param node: the id XML node
    :param kwargs: unused
    """
    node_id = node.text

    # The first attribute found gives the type of the id
    for attr_name in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attr_name in node.attrib:
            node_type = node.attrib[attr_name]
            break
    else:
        node_type = ""

    if node_type == "pii":
        # Elsevier ids get a special treatment: web scraping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid.startswith("CR"):
            self.pii = node_id
        return

    if node_type in ("numdam-id", "mathdoc-id"):
        self.pid = node_id
        return

    if node_type == "ark":
        self.extids.append((node_type, node_id))
        return

    if node_type in ("doi", "eid"):
        self.ids.append((node_type, node_id))
        if node_type == "doi":
            self.doi = node_id

1306 

def parse_kwd_group(self, node, **kwargs):
    """
    Parse a kwd-group node and extend self.kwds with its keywords.

    The keywords are the text of the <kwd> children. An <unstructured-kwd-group>
    child replaces the keywords found so far with the result of split_kwds
    applied to its TeX text. A warning is recorded for any other child.

    Each keyword is stored as {"type", "lang", "value"}: the type is the
    content-type attribute of the group or, if it is missing or empty, its
    kwd-group-type attribute; the lang defaults to self.lang.

    :param node: the kwd-group XML node
    :param kwargs: unused
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # The attribute was read as "content-node_type", a leftover of a rename of
    # the "type" variables: that attribute never exists, so content-type was ignored
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend([{"type": content_type, "lang": lang, "value": kwd} for kwd in kwds])

1336 

def parse_ref_list(self, node, **kwargs):
    """
    Parse a ref-list node.

    Each <ref> is parsed by a JatsRef: the JatsRef is appended to self.bibitems,
    its citation_html to self.bibitem and its warnings to self.warnings.
    A <p> is parsed for its side effects only (the returned text is dropped).
    A warning is recorded for any other child.

    :param node: the ref-list XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        if tag != "ref":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        ref = JatsRef(tree=child, lang=self.lang)
        self.warnings.extend(ref.warnings)
        self.bibitems.append(ref)
        self.bibitem.append(ref.citation_html)

1359 

def parse_related_article(self, node, **kwargs):
    """
    Parse a related-article node and append a relation (rel_type, id_value)
    to self.relations.

    The id is the text of the node. If self has a pii attribute and the id is
    neither "NONE" nor a text containing "10.", it is considered as a pii and
    replaced by the result of scrapping.fetch_article.

    :param node: the related-article XML node
    :param kwargs: unused
    """
    relation = Foo()
    relation.rel_type = get_normalized_attrib(node, "related-article-type") or ""

    id_value = node.text
    looks_like_pii = bool(id_value) and "10." not in id_value and id_value != "NONE"
    if hasattr(self, "pii") and looks_like_pii:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation.id_value = id_value
    self.relations.append(relation)

1375 

def parse_related_object(self, node, **kwargs):
    """
    Parse a related-object node.

    Without a document-id-type attribute, the data of the node (rel, mimetype,
    location, base, metadata) are appended to self.related_objects.
    With this attribute, a "refers to" relation to the document-id is appended to
    self.relations, unless the id is "NONE". A non-empty id that does not contain
    "10." is considered as a pii and replaced by the result of scrapping.fetch_article.

    :param node: the related-object XML node
    :param kwargs: unused
    """
    data = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not node.get("document-id-type"):
        self.related_objects.append(data)
        return

    id_value = node.get("document-id") or ""
    if id_value == "NONE":
        return

    if id_value and "10." not in id_value:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        id_value = scrapping.fetch_article(self.doi, id_value, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = id_value
    self.relations.append(relation)

1408 

def parse_sec(self, node, **kwargs):
    """
    Parse a sec node: only its <ref-list> children are parsed.
    <title> children are ignored; a warning is recorded for any other child.

    :param node: the sec XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
            continue

        origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: origin + " " + tag})

1427 

def parse_self_uri(self, node, **kwargs):
    """
    Parse a self-uri node and append a "full-text" stream to self.streams.

    The mimetype is the content-type attribute ("text/html" by default).
    Nodes with the "application/xml" mimetype are not added.

    :param node: the self-uri XML node
    :param kwargs: unused
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location like the other Mathdoc collections:
    # The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    stream = {
        "rel": "full-text",
        "mimetype": mimetype,
        "location": location,
        "base": base,
        "text": "" if node.text is None else normalize_space(node.text),
    }

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(stream)

1452 

def parse_sub_article(self, node, **kwargs):
    """
    Parse a sub-article node (used for translations): a JatsArticle is created
    from the node and appended to self.translations.

    :param node: the sub-article XML node
    :param kwargs: unused
    """
    self.translations.append(JatsArticle(tree=node))

1457 

def parse_subj_group(self, node, **kwargs):
    """
    Parse a subj-group node: each <subject> child is appended to self.subjs as
    {"type", "lang", "value"}. The type is the subj-group-type attribute of the
    group; the lang defaults to self.lang.
    A warning is recorded for any other child.

    :param node: the subj-group XML node
    :param kwargs: unused
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1479 

def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a supplementary-material node and append it to self.supplementary_materials,
    unless a material with the same file name (basename of the location) is already there.

    The location is the href attribute or, by default, the id attribute.
    The mimetype is the mimetype attribute or, by default, the one returned by
    resolver.get_mimetype for the location.

    :param node: the supplementary-material XML node
    :param kwargs: unused
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = (
        get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    )
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption or "",
    }

    known_files = {os.path.basename(item["location"]) for item in self.supplementary_materials}
    if os.path.basename(location) not in known_files:
        self.supplementary_materials.append(material)

1510 

def parse_title(self, node, **kwargs):
    """
    Parse a title node and set self.title_tex and self.title_html.
    The xref nodes of the title are ignored (ignore_xref=True).

    :param node: the title XML node
    :param kwargs: unused
    """
    parsed = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex, self.title_html = parsed
    # In xmldata.py, title_xml had the <title_group> tag:
    # self.title_xml can't be set in parse_title

1517 

def parse_title_group(self, node, **kwargs):
    """
    Parse a title-group node: the title (with its subtitle appended after a space),
    the translated title, the abbreviated title and the footnotes of the title.
    A warning is recorded for any other child.

    self.title_xml is the XML of the title-group, without its <fn-group>
    (the footnotes are added to self.footnotes_xml / self.footnotes_html instead).

    :param node: the title-group XML node
    :param kwargs: unused
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    has_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            sub_tex, sub_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + sub_tex
            self.title_html += " " + sub_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            has_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True, keep_fn_label=False)
                self.footnotes_xml += get_xml_from_node(fn_node)
                self.footnotes_html += fn_html
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    if not has_fn_group:
        self.title_xml = get_xml_from_node(node)
        return

    # fn-group is now a funding statement and will be exported separately in the XML:
    # => remove it from the title-group
    stripped_group = etree.Element("title-group")
    for child in node:
        if normalize(child.tag) != "fn-group":
            stripped_group.append(copy.deepcopy(child))
    self.title_xml = get_xml_from_node(stripped_group)

1566 

def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a trans-abstract node and append it to self.abstracts.

    The entry tag is the abstract-type attribute; it is "abstract" when the
    attribute is missing or is "author". The lang is "und" if the node has none.

    :param node: the trans-abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type")
    if not abstract_type or abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    value_tex, value_html = self.parse_node_with_mixed_content(node)

    entry = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": get_xml_from_node(node),
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(entry)

1583 

def parse_trans_title(self, node, **kwargs):
    """
    Parse a trans-title node and set self.trans_title_tex, self.trans_title_html
    and self.trans_title_xml.

    :param node: the trans-title XML node
    :param kwargs: unused
    """
    parsed = self.parse_node_with_mixed_content(node)
    self.trans_title_tex, self.trans_title_html = parsed
    self.trans_title_xml = get_xml_from_node(node)

1587 

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a trans-title-group node: its <trans-title> children are handled by
    parse_trans_title and self.trans_lang is set to the lang of the group
    ("und" if the group has none). A warning is recorded for any other child.

    :param node: the trans-title-group XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1606 

def get_data_from_contrib(self, node):
    """
    Return the data of the person described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or <name-alternatives>
    In a <mixed-citation>, each <name> creates 1 person: we can't use the same code

    The dict comes from create_contributor and is filled from the children of the
    node, then from its corresp, deceased and equal-contrib attributes.
    A warning is recorded for any unknown child.

    :param node: the contrib XML node
    :return: the dict describing the contributor
    """
    params = create_contributor()

    for child in node:
        tag = child.tag

        if tag in ("name", "string-name"):
            self.update_data_from_name(child, params)
            # A string-name without structured names keeps its raw text
            if tag == "string-name" and params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = child.text or ""
        elif tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(child)
        elif tag == "contrib-id":
            if (child.get("contrib-id-type") or "") == "orcid":
                params["orcid"] = child.text or ""
        elif tag == "address":
            params["addresses"].append(get_text_from_node(child))
        elif tag == "email":
            params["email"] = child.text or ""
        elif tag == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (child.get("ref-type") or "") != "aff":
                continue
            xref = child.get("rid") or ""
            if xref == "":
                xref = get_text_from_node(child)
            if xref != "":
                params.setdefault("xrefs", []).append(xref)
        elif tag == "collab":
            params["string_name"] = child.text or ""
        elif tag == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to params['role'] as we want a controlled vocabulary
            # (author /editor / organizer...)
            # Ignore the value
            pass
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # The addresses are not sorted: a sort causes differences between the HTML and the PDF
    # (discovered in PCJ). The sort was introduced on 22/09/2020, based on differences
    # between the Cedrics->JATS XSLT and the Cedrics import

    helper_update_name_params(params)

    if node.get("corresp") == "yes":
        params["corresponding"] = True

    params["deceased_before_publication"] = node.get("deceased") == "yes"
    params["equal_contrib"] = node.get("equal-contrib") == "yes"

    return params

1684 

def get_data_from_custom_meta(self, node):
    """
    Read the name/value pair stored in a <custom-meta> node.

    :param node: the <custom-meta> node
    :return: a (name, value) tuple holding the text of <meta-name> and <meta-value>;
             each item stays "" when the corresponding child is absent.
             Any other child is reported in self.warnings.
    """
    found = {"meta-name": "", "meta-value": ""}

    for child in node:
        tag = normalize(child.tag)

        if tag in found:
            found[tag] = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    return found["meta-name"], found["meta-value"]

1708 

def get_data_from_date(self, node, ignore_month=False):
    """
    Return the date stored in a date-like node as a string.

    The "iso-8601-date" attribute wins when present. Otherwise the string is
    assembled from the <year>, <month> and <day> children as year[-month][-day].

    :param node: the node holding the date
    :param ignore_month: if True, <month> children are not used
                         (they then end up in the warnings like any unhandled tag)
    :return: the date string
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for child in node:
        tag = normalize(child.tag)

        if tag == "year":
            year = child.text
        elif tag == "month" and not ignore_month:
            month = child.text
        elif tag == "day":
            day = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # Month and day are only appended to a non-empty string
    date_str = year
    for part in (month, day):
        if date_str and part:
            date_str += "-" + part

    return date_str

1742 

def get_data_from_ext_link(self, node, **kwargs):
    """
    Describe an <ext-link> node as a link dict.

    :param node: the <ext-link> node
    :param kwargs: forwarded to parse_inner_node (add_HTML_link is forced to False)
    :return: a dict with the keys rel, mimetype, location, base and metadata
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    kwargs["add_HTML_link"] = False
    # Only the second item returned by parse_inner_node is kept
    metadata = self.parse_inner_node(node, **kwargs)[1]

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1760 

def get_data_from_history(self, node):
    """
    Collect the typed dates of a <history> node.

    :param node: the <history> node
    :return: a list of {"type": ..., "date": ...} dicts, one per child that has a
             "date-type" attribute. Children without it are reported in self.warnings.
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    history_dates = []

    for child in node:
        if "date-type" not in child.attrib:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
            continue

        entry_type = child.attrib["date-type"]
        entry_date = self.get_data_from_date(child)
        history_dates.append({"type": entry_type, "date": entry_date})

    return history_dates

1782 

def update_data_from_name(self, node, contributor):
    """
    Copy the name parts of a <name>/<string-name> node into a contributor dict.

    :param node: the node whose children hold the name parts
    :param contributor: the dict to update in place
    :return: None. Children without text are skipped; children with text but an
             unhandled tag are reported in self.warnings.
    """
    key_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for child in node:
        if child.text is None:
            continue

        key = key_by_tag.get(child.tag)
        if key is not None:
            contributor[key] = child.text
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})

1804 

def get_data_from_name_alternatives(self, node):
    """
    Extract the index form of a name from a <name-alternatives> node.

    :param node: the <name-alternatives> node
    :return: the text of the last <string-name specific-use="index"> child, or ""
             if there is none. Children without text are skipped; children with
             text but another tag are reported in self.warnings.
    """
    mid = ""

    for child in node:
        if child.text is None:
            continue

        if child.tag != "string-name":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + child.tag})
            continue

        if child.get("specific-use") == "index":
            mid = child.text

    return mid

1825 

def get_data_from_uri(self, node, **kwargs):
    """
    Describe a <uri> node as a link dict.

    :param node: the <uri> node
    :param kwargs: forwarded to parse_inner_node (add_HTML_link is forced to False)
    :return: a dict with the same keys as the one built for an <ext-link>;
             "rel" is None, "mimetype" and "base" are empty
    """
    location = get_normalized_attrib(node, "href") or ""

    kwargs["add_HTML_link"] = False
    # Only the second item returned by parse_inner_node is kept
    metadata = self.parse_inner_node(node, **kwargs)[1]

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1835 

def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text to output for a link node.

    The node is first turned into a link dict by the get_data_from_<tag> method
    matching its tag. If the link has no "rel", or a "uri" rel, the result is a
    link (a \\href command when self.for_tex_file is set, otherwise the value of
    make_links_clickable). For any other rel, the plain text of the node is returned.

    :param node: the link node
    :param kwargs: forwarded to the get_data_from_<tag> method
    :return: the text of the link
    """
    plain_text = node.text or ""

    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    data = getattr(self, getter_name)(node, **kwargs)

    if data["rel"] and data["rel"] != "uri":
        return plain_text

    href = data["location"]
    if self.for_tex_file:
        return "\\href{" + href + "}{" + data["metadata"] + "}"
    return make_links_clickable(href, data["metadata"])

1849 

def get_list_start_value(self, list_node):
    """
    Compute the numbering offset of a list that continues another list.

    :param list_node: the list node; its "continued-from" attribute, if any, holds
                      the id of the list it continues
    :return: 0 when the list does not continue another one, or when the referenced id
             can not be found in self.tree; otherwise the number of children of the
             referenced list plus that list's own start value (chains are followed
             recursively).
    """
    continued_from = list_node.get("continued-from")
    if continued_from is None:
        return 0

    from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
    if from_node is None:
        # Dangling reference: previously "start" was left unbound here and the
        # function raised UnboundLocalError. Number the list from the beginning.
        return 0

    return len(from_node) + self.get_list_start_value(from_node)

1860 

1861 

class MathdocPublication(MathdocPublicationData, JatsBase):
    """
    Publication (collection) built from an XML tree.

    The tree given as the "tree" keyword argument is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the top-level children of the publication tree and set the
        corresponding instance variables. Unhandled tags go to self.warnings.

        :param tree: the root node of the publication
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # An id without type is treated like a numdam/mathdoc id
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children of a <custom-meta-group>.

        The "serial-type", "wall" and "provider" entries set self.coltype,
        self.wall (converted with int()) and self.provider. Other entry names are
        silently ignored; children that are not <custom-meta> go to self.warnings.

        :param node: the <custom-meta-group> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Store a <description> node as an entry of self.abstracts.

        value_xml keeps the full XML of the node; value_html and value_tex hold
        the same XML without the enclosing <description> tags.

        :param node: the <description> node
        """
        # tag = get_normalized_attrib(node, "abstract-node_type") or "abstract"
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)
        # Strip the opening tag (with its attributes, if any) and the closing tag.
        # The previous code replaced the misspelled "<decription" and therefore
        # never removed the opening tag.
        inner = re.sub(r"^\s*<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")
        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1944 

1945 

class JatsPublisher(PublisherData):
    """
    Publisher built from a <publisher> node, given as the "tree" keyword argument.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.warnings = []
        self.parse_tree(kwargs["tree"])
        # NOTE(review): this second reset drops every warning collected by
        # parse_tree above - confirm whether that is intended.
        self.warnings = []

    def parse_tree(self, tree):
        """
        Read the name and the location of the publisher.

        :param tree: the <publisher> node
        """
        attr_by_tag = {"publisher-name": "name", "publisher-loc": "loc"}

        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

1971 

1972 

class JatsJournal(JournalData, JatsBase):
    """
    Journal built from a <journal-meta> node, given as the "tree" keyword argument.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the journal node: ids, titles, publisher and ISSNs.
        Unhandled tags go to self.warnings.

        :param tree: the journal node
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                # A journal-id without type is treated as a numdam-id
                id_type = node.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                # An issn without pub-type is treated as the print ISSN
                pub_type = node.get("pub-type") or "ppub"
                if pub_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif pub_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

2010 

2011 

class JatsIssue(IssueData, JatsBase):
    """
    Issue built from an XML tree, given as the "tree" keyword argument.

    The tree is parsed in the constructor: the journal metadata, the issue
    metadata and the articles of the <body> are read, then the articles are
    post-processed (see parse_tree).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs["from_folder"] if "from_folder" in kwargs else None

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the issue tree, then post-process the articles.

        Post-processing steps:
        - the publisher of the journal becomes the publisher of the issue;
        - editors replicated in an article (same number, same first/last names as
          the issue editors) are removed from the article;
        - if at least one article has a "pii" attribute (taken here as the mark of
          an Elsevier issue), the icon locations are rewritten and the article
          types / subjects are recomputed.

        :param tree: the root node of the issue
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(tree=child, issue=self, from_folder=self.from_folder)
                        # The warnings of the article are reported at the issue level
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        is_elsevier = False
        for xarticle in self.articles:
            if hasattr(xarticle, "pii"):
                is_elsevier = True

            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]
            # The editors match if both lists have the same length and the same
            # names at each position (an article without editors matches an issue
            # without editors).
            is_equal = len(editors) == len(issue_editors)
            i = 0
            while is_equal and i < len(editors):
                if (
                    editors[i]["last_name"] != issue_editors[i]["last_name"]
                    or editors[i]["first_name"] != issue_editors[i]["first_name"]
                ):
                    is_equal = False
                i += 1
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

        if is_elsevier:
            # Fix location of icons
            for link in self.ext_links:
                if link["rel"] in ["icon", "small_icon"]:
                    base_dir = self.journal.pid
                    location = link["location"]
                    # Prefix the location with <journal pid>/<issue pid> unless its
                    # directory is already the journal pid
                    if os.path.dirname(location) != base_dir:
                        location = os.path.join(base_dir, self.pid, location)
                    if self.from_folder:
                        location = os.path.join(self.from_folder, location)
                        location = "file:" + location
                    link["location"] = location

            # Fix article types and subjects
            for xarticle in self.articles:
                article_type = "research-article"
                old_type = ""
                new_subjs = []

                if xarticle.fpage != "":
                    try:
                        # The converted value is not used: only the success of the conversion matters
                        value = int(xarticle.fpage)
                    except ValueError:
                        # fpage is not a number: the article is an editorial
                        article_type = "editorial"

                # The subjects are only inspected (and kept) if the page test above
                # did not already classify the article
                if article_type == "research-article":
                    for subj in xarticle.subjs:
                        if subj["type"] == "type":
                            # Fix article types
                            value = subj["value"].lower()
                            old_type = value
                            if value == "discussion":
                                article_type = "letter"
                            elif value == "editorial":
                                if xarticle.title_tex.lower().find("foreword") == 0:
                                    article_type = "foreword"
                                else:
                                    article_type = "editorial"
                            elif value in ["mini review", "review article", "book review"]:
                                article_type = "review"
                            elif value == "research article":
                                article_type = "research-article"
                            elif value == "short communication":
                                article_type = "foreword"
                            elif value == "correspondence":
                                article_type = "letter"
                            elif value.find("conference") == 0:
                                article_type = "congress"
                        elif subj["type"] == "heading" and not xarticle.title_tex:
                            # The title may be stored in the heading: fix it
                            xarticle.title_tex = xarticle.title_html = subj["value"]
                            xarticle.title_xml = get_title_xml(subj["value"])
                        elif subj["type"] == "heading":
                            value = subj["value"].lower().strip()
                            issue_title = self.title_tex.lower()
                            # Remove the "dossier: " prefix (9 characters) from the issue title
                            if issue_title.find("dossier: ") == 0:
                                issue_title = issue_title[9:]
                                self.title_tex = self.title_html = self.title_tex[9:]
                                # NOTE(review): title_xml is built from the lowercased
                                # issue_title while title_tex keeps its case - confirm
                                self.title_xml = (
                                    "<issue-title>"
                                    + get_single_title_xml(issue_title)
                                    + "</issue-title>"
                                )

                            # Some heading values are in fact article type
                            if value.find("erratum") == 0:
                                article_type = "erratum"
                            elif value.find("corrigendum") == 0:
                                article_type = "corrigendum"
                            elif value.find("foreword") == 0:
                                article_type = "foreword"
                            elif value.find("nécrologie") == 0 or value.find("obituary") == 0:
                                article_type = "history-of-sciences"
                            elif (
                                value.find("block calendar/éphéméride") == 0
                                or value.find("chronique") == 0
                            ):
                                article_type = "history-of-sciences"
                            elif value.find("histoire") == 0 or value.find("historic") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("tribute/hommage") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("note historique") == 0:
                                article_type = "historical-commentary"
                            elif (
                                value.find("le point sur") == 0 or value.find("le point-sur") == 0
                            ):
                                article_type = "review"
                            elif (
                                value.find("review") == 0
                                or value.find("revue") == 0
                                or value.find("concise review") == 0
                            ):
                                article_type = "review"
                            elif value.find("conférence") == 0:
                                article_type = "congress"
                            elif (
                                value.find("communication") == 0 or value.find("preliminary") == 0
                            ):
                                article_type = "preliminary-communication"
                            elif value.find("perspective") == 0 and old_type in [
                                "correspondence",
                                "short communication",
                            ]:
                                article_type = "opinion"
                            elif value.find("debate") == 0:
                                article_type = "opinion"
                            elif (
                                value.find("index") == 0
                                or value.find("keyword") == 0
                                or value.find("sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif (
                                value.find("table auteurs") == 0
                                or value.find("table sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif value.find("page présentation des index") == 0:
                                article_type = "editorial"
                            elif value.find("fac-similé") == 0:
                                # crbiol article; PubMed classifies these as "Classical Article"
                                article_type = "historical-commentary"
                                # Keep the subject in this case to preserve the "fac-similé"
                                # (i.e. facsimile copy) mention
                                new_subjs.append(subj)
                            # Ignore the issue titles
                            elif (
                                not self.title_tex
                                or value.find(self.title_tex.lower().strip()) != 0
                            ):
                                # Exclude headings that are redundant with article types
                                exclude_list = [
                                    "editorial",
                                    "éditorial",
                                    "avant-propos",
                                    "book review",
                                    "comment",
                                    "concise review paper",
                                    "answer",
                                    "commentaire",
                                    "commentary",
                                    "reply",
                                    "foreword",
                                    "full paper",
                                    "mémoire",
                                ]
                                # Keep the heading only if it starts with none of the excluded values
                                if len([x for x in exclude_list if value.find(x) == 0]) == 0:
                                    new_subjs.append(subj)
                        else:
                            # Subjects that are neither a type nor a heading are kept as is
                            new_subjs.append(subj)

                # print(old_type, '-', old_heading, '-', article_type, '-', xarticle.pid, '-', xarticle.fpage)
                xarticle.atype = article_type
                xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children of a <custom-meta-group>.

        The "provider" entry sets self.provider; the "efirst" entry sets
        self.with_online_first (True when the value is "yes"). Other entry names
        are silently ignored; children that are not <custom-meta> go to self.warnings.

        :param node: the <custom-meta-group> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Parse the children of an <issue-meta> node and set the issue variables.

        Tags without an explicit branch are dispatched to the parse_<tag> method of
        the instance, if there is one (called with add_ext_link=True); otherwise
        they go to self.warnings.

        :param node: the <issue-meta> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                # Only the year is wanted here, hence ignore_month
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    # The first title whose language is compatible with the issue
                    # language is the main title; any other one is a translated title
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        # Without a "last-modified" date in the XML, use the current time
        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2323 

2324 

class JatsArticleBase(JatsBase):
    """
    Base class providing the parse functions shared by article-like objects.
    """

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children of a <custom-meta-group>.

        "article-number" and "talk-number" set the instance variables of the same
        name (with underscores). "presented" adds a contributor with the
        "presenter" role to self.contributors. Other entry names are silently
        ignored; children that are not <custom-meta> go to self.warnings.

        :param node: the <custom-meta-group> node
        """
        for child in node:
            tag = normalize(child.tag)

            if tag != "custom-meta":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            name, value = self.get_data_from_custom_meta(child)

            if name == "article-number":
                self.article_number = value
            elif name == "talk-number":
                self.talk_number = value
            elif name == "presented":
                # Keep only the name of the presenter
                presenter_name = value.replace("Presented by ", "")
                presenter_name = presenter_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = presenter_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)

2355 

2356 

2357class JatsArticle(ArticleData, JatsArticleBase): 

def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Create the article and parse the tree given as the "tree" keyword argument.

    Optional keyword arguments read here: pid, issue, add_span_around_tex_formula,
    for_tex_file and from_folder.
    """
    super().__init__(*args, **kwargs)

    self.pid = kwargs.get("pid")
    self.issue = kwargs.get("issue")
    self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
    self.for_tex_file = kwargs.get("for_tex_file", False)
    self.from_folder = kwargs.get("from_folder")

    self.parse_tree(kwargs["tree"])

2372 

def parse_tree(self, tree):
    """
    Parse the article tree in two passes and set the instance variables.

    The first pass reads the <front> / <front-stub> metadata and the
    <floats-group>; the second pass reads <back>, <body> and <sub-article>.
    Unhandled tags go to self.warnings. Afterwards the footnotes are appended to
    body_html, the funding statement is wrapped in a <name-content> element and
    the XML of the floats group is appended to body_xml.

    :param tree: the root node of the article
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                tag = normalize(child.tag)

                if tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "front-stub":
            # A <front-stub> is parsed like an <article-meta>
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)
        if tag == "back":
            for child in node:
                tag = normalize(child.tag)

                if tag == "ref-list":
                    self.parse_ref_list(child)
                elif tag == "ack":
                    self.parse_ack(child)
                elif tag == "sec":
                    self.parse_sec(child)
                elif tag == "app-group":
                    self.parse_app_group(child)
                elif tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in ("floats-group", "front", "front-stub"):
            # Handled above. "front-stub" used to be missing here, so a tag that
            # had been parsed by the first loop was also reported as a warning.
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Add the footnotes at the end
    if len(self.fns) > 0:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"

        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    # Wrap the funding statement in a <name-content> unless it already has one
    if (
        len(self.funding_statement_xml) > 0
        and self.funding_statement_xml.find('<name-content content-type="fn"') == -1
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # Special treatment for Elsevier articles: web scraping to find the date_published
    # Moved to the import management commands since Elsevier blocks IP after 1000+ requests
    # if hasattr(self, 'pii') and self.date_published_iso_8601_date_str is None:
    #     article_data = scrapping.fetch_article(self.doi, self.pii)
    #     self.date_published_iso_8601_date_str = article_data.date_published_iso_8601_date_str

2478 

def update_body_content(self, node, **kwargs):
    """
    Append the content of a node to the body of the article
    (self.body_tex, self.body_html and self.body_xml).

    :param node: the node to add (<body>, or a node converted into a section)
    :param kwargs: forwarded to the parse function; "use_sec", when true, makes the
                   node a <sec> of the body (hack for Elsevier <ack> nodes)
    """
    if len(node) == 0:
        # Most journals do not display the Full text
        # the <body> is then used to store the text for the search engine and has no children
        # Let's not compute body_html in this case.
        # We want the same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    article_base = settings.ARTICLE_BASE_URL
    if hasattr(settings, "SITE_URL_PREFIX"):
        kwargs["base_url"] = "/" + settings.SITE_URL_PREFIX + article_base + self.pid
    else:
        kwargs["base_url"] = os.path.join(article_base, self.pid)

    use_sec = bool(kwargs.get("use_sec"))
    nb_supplements = len(self.supplementary_materials)

    # Hack for Elsevier: with use_sec, convert <ack> into <sec> of the <body>
    parse = self.parse_node_with_sec if use_sec else self.parse_node_with_mixed_content
    body_tex, body_html = parse(node, **kwargs)

    if len(self.supplementary_materials) != nb_supplements:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements:
        # the node is kept only if a <p> has a child that is not a supplement.
        append_to_body = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )
    else:
        append_to_body = True

    if not append_to_body:
        return

    if self.body_tex:
        self.body_tex = self.body_tex + body_tex
    else:
        self.body_tex = body_tex

    if self.body_html:
        self.body_html = self.body_html + body_html
    else:
        self.body_html = body_html

    body_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = body_xml
        return

    # Insert the new XML before the last 7 characters of the current body_xml
    # (the length of "</body>")
    current_xml = self.body_xml[0:-7]
    if use_sec:
        # Replace the first 5 and last 6 characters of the node XML with <sec>...</sec>
        self.body_xml = f"{current_xml}<sec>{body_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{current_xml}{body_xml}</body>"

2529 

def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    A node whose content-type is "COI-statement" is stored as text in self.coi_statement.
    Any other node is merged into the body content as a section.

    :param node: the <ack> XML node
    :return: None
    """
    if (node.get("content-type") or "") != "COI-statement":
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        self.update_body_content(node, use_sec=True)
        return

    self.coi_statement = get_text_from_node(node)

2537 

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is merged into the body content.

    Children with another tag are reported in self.warnings.

    :param node: the <app> XML node
    :return: None
    """
    for element in node:
        tag = normalize(element.tag)

        if tag != "sec":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(element)

2556 

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is handed to parse_app.

    Children with another tag are reported in self.warnings.

    :param node: the <app-group> XML node
    :return: None
    """
    for element in node:
        tag = normalize(element.tag)

        if tag != "app":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_app(element)

2573 

def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node: each <subj-group> child is handed to
    parse_subj_group.

    Children with another tag are reported in self.warnings.

    :param node: the <article-categories> XML node
    :return: None
    """
    for element in node:
        tag = normalize(element.tag)

        if tag != "subj-group":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_subj_group(element)

2590 

def parse_article_meta(self, node, **kwargs):
    """
    Parse an <article-meta> node and set the corresponding instance variables.

    Ids, pages, size, elocation, publication date and history dates are handled here.
    Any other tag is dispatched to the parse_<tag> method of the instance, if there is
    one (called with add_ext_link=True); otherwise the tag is reported in self.warnings.

    :param node: the <article-meta> XML node
    :return: None
    """
    # Tags that are read but deliberately not stored:
    # - permissions: TODO: store permissions in XML
    # - author-notes: 2022/11/15 Mersenne meeting. ignore author-notes
    #   (parse_author_notes is no longer called from here)
    ignored_tags = (
        "volume",
        "issue-id",
        "permissions",
        "pub-date-not-available",
        "author-notes",
    )

    for element in node:
        tag = normalize(element.tag)

        if tag in ignored_tags:
            continue

        if tag == "article-id":
            self.parse_id(element)
        elif tag == "fpage":
            self.fpage = element.text
            self.page_type = element.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = element.text or ""
        elif tag == "page-range":
            self.page_range = element.text
        elif tag in ("page-count", "size"):
            self.size = element.text
        elif tag == "elocation-id":
            self.elocation = element.text
        elif tag == "pub-date":
            # A pub-date without date-type is considered as the publication date;
            # any other date-type is stored in the history as an "online" date
            is_publication_date = (element.get("date-type") or "pub") == "pub"
            date_str = self.get_data_from_date(element)
            if is_publication_date:
                self.date_published_iso_8601_date_str = date_str
            else:
                self.history_dates.append({"type": "online", "date": date_str})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(element)
            for history_date in self.history_dates:
                if history_date["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = history_date["date"]
        else:
            # "Automatic" call of parse_<tag>
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(element, add_ext_link=True)
            else:
                origin = (
                    self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                )
                self.warnings.append({self.pid: origin + " " + tag})

2642 

def parse_author_notes(self, node, **kwargs):
    """
    Parse an <author-notes> node: the HTML and XML of each <fn> child are appended
    to self.footnotes_html and self.footnotes_xml.

    Children with another tag are silently ignored.

    :param node: the <author-notes> XML node
    :return: None
    """
    for element in node:
        if normalize(element.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(element, keep_fn=True, keep_fn_label=False)
        fn_xml = get_xml_from_node(element)
        self.footnotes_xml += fn_xml
        self.footnotes_html += fn_html

2651 

def parse_body(self, node, **kwargs):
    """
    Parse a <body> node.

    The plain text of the node is stored in self.body, then update_body_content
    computes body_tex/body_html/body_xml. If body_xml is still empty afterwards,
    it is set to the XML of the node.

    :param node: the <body> XML node
    :param kwargs: keywords forwarded to update_body_content
    :return: None
    """
    self.body = get_text_from_node(node)

    # self.floats only exists once parse_floats_group has been called
    floats_were_parsed = hasattr(self, "floats")
    if floats_were_parsed:
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    if self.body_xml:
        return
    self.body_xml = get_xml_from_node(node)

2662 

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> node (called by parse_floats_group for the children of
    <floats-group>) and store its HTML in the self.floats dictionary, under the id
    of the node. Nothing is stored when the node has no id attribute.

    :param node: the <boxed-text> XML node
    :param kwargs: keywords forwarded to parse_node_with_boxed_text
    :return: None
    """
    box_id = node.attrib.get("id")

    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    if box_id is None:
        return
    self.floats[box_id] = box_html

2674 

def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    self.floats is reset to an empty dictionary, then the <fig>, <table-wrap> and
    <boxed-text> children are parsed with a base_url built from the Django settings
    and self.pid. Children with another tag are reported in self.warnings.
    The XML of the whole node is stored in self.floats_group_xml.

    :param node: the <floats-group> XML node
    :return: None
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}
    for element in node:
        tag = normalize(element.tag)

        if tag in ("fig", "table-wrap"):
            if tag == "fig":
                parse_float = self.parse_node_with_fig
            else:
                parse_float = self.parse_node_with_table_wrap
            parse_float(element, append_floats=True, base_url=base_url)
        elif tag == "boxed-text":
            self.parse_boxed_text(element, base_url=base_url)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2705 

def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node: the HTML and XML of each <fn> child are appended
    to self.footnotes_html and self.footnotes_xml.

    Children with another tag are reported in self.warnings.

    :param node: the <fn-group> XML node
    :return: None
    """
    for element in node:
        tag = normalize(element.tag)

        if tag != "fn":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(element, keep_fn=True)
        fn_xml = get_xml_from_node(element)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml

2726 

def parse_funding_group(self, node, **kwargs):
    """
    Parse a <funding-group> node.

    <award-group> children are handed to parse_award_group.
    For <funding-statement> children, the HTML of the <fn> found inside a
    <name-content> is appended to self.funding_statement_html and the XML of the
    <name-content> is stored in self.funding_statement_xml.
    Children with another tag are reported in self.warnings.

    :param node: the <funding-group> XML node
    :return: None
    """
    for element in node:
        tag = normalize(element.tag)

        if tag == "award-group":
            self.parse_award_group(element)
            continue

        if tag != "funding-statement":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # TODO: handle funding-statement with simple texts
        for statement_child in element:
            if statement_child.tag != "name-content":
                continue

            for content_child in statement_child:
                if content_child.tag == "fn":
                    _, fn_html = self.parse_node_with_fn(content_child, keep_fn=True)
                    self.funding_statement_html += fn_html
            # NOTE(review): the XML is assumed to be stored once per <name-content>,
            # after its <fn> children have been parsed - confirm the intended nesting
            self.funding_statement_xml = get_xml_from_node(statement_child)

2753 

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the seq attribute of an <issue> node.

    self.seq is "0" when the attribute is missing or empty, and also whenever the
    instance has a pii attribute.

    :param node: the <issue> XML node
    :return: None
    """
    if hasattr(self, "pii"):
        # Elsevier stores bs in the seq attribute
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2757 

2758 

class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference parsed from an XML tree (presumably a JATS <ref> node).

    The tree given with the ``tree`` keyword is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):  # , tree, lang):
        """
        Initialise the base classes, then parse the reference.

        :param kwargs: must contain ``tree``, the XML node of the reference
        """
        super().__init__(*args, **kwargs)  # lang)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the reference node.

        Sets self.user_id, self.label (wrapped in square brackets if needed),
        self.citation_tex / self.citation_html and appends the XML of each child to
        self.citation_xml. Unknown tags are reported in self.warnings.

        :param tree: the XML node of the reference
        :return: None
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Build the contributor dictionary of a name found in a reference.

        :param node: a <name>, <string-name>, <name-alternatives>, <collab> or <etal> node
        :param role: value stored in the "role" key of the contributor
        :return: the contributor dictionary (created with create_contributor)
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A string-name without structured children: keep its text as is
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title> node.

        Inside a mixed-citation (``is_mixed_citation`` keyword), the HTML is post-processed
        by add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source> node.

        Inside a mixed-citation (``is_mixed_citation`` keyword), the HTML is post-processed
        by add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the metadata of a citation node (type, titles, source, series, names,
        ids, comment, annotation...) into instance variables.

        Tags without a dedicated branch are stored in the attribute named after the tag
        ("-" replaced with "_"), unless that attribute already has a value.

        :param node: the citation XML node (mixed-citation, note or element-citation)
        :return: None
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag == "isbn":
                # An empty <isbn/> has no text (None): nothing to add to the annotation
                if child.text:
                    if self.annotation:
                        self.annotation += ", "
                    self.annotation += "ISBN: " + child.text
            elif tag == "issn":
                # An empty <issn/> has no text (None): nothing to add to the annotation
                if child.text:
                    if self.annotation:
                        self.annotation += ", "
                    self.annotation += "ISSN: " + child.text
            elif child.text is not None:
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    # The value is already set and a comment was found before:
                    # the new value is appended to the comment
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group> node and append its names to self.contributors.

        The role of the contributors is the person-group-type attribute, without its
        trailing "s" if any. Children that are not names are reported in
        self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id> node: its pub-id-type attribute and its text are passed to
        add_extids_from_node_with_link.
        """
        node_type = node.get("pub-id-type") or ""

        data = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Splits the lower-cased label, without its first and last characters (the
        brackets added by parse_tree), around its digits and stores the two parts in
        self.label_prefix and self.label_suffix.
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            # (re.split did not return exactly 2 parts)
            self.label_prefix, self.label_suffix = [label, ""]

3026 

3027 

class BitsCollection(CollectionData, JatsBase):
    """
    Collection of a book, parsed from a <collection-meta> or <in-collection> node.

    The tree given with the ``tree`` keyword is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the base classes, then parse the collection.

        :param kwargs: must contain ``tree``, the XML node of the collection
        """
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or the children of an <in-collection> node,
        then compute self.seq from the <collection-meta> node that was found.

        A warning is added if no <collection-meta> was found.
        self.collection is always set, to an object that holds the pid.

        :param tree: the XML node of the collection (may be None)
        :return: None
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None
            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse a <collection-meta> node: collection type, pid, title, ISSNs, ext-links
        and volume information. Unknown tags are reported in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                # Only the "ppub" and "epub" pub-types are stored
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume(self, node, **kwargs):
        """Store the text of the node in self.volume."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse a <volume-in-collection> node: volume number, series and title.
        Unknown tags are reported in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume_series(self, node, **kwargs):
        """Store the text of the node in self.vseries (None if the node has no text)."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Store the TeX, HTML and XML versions of the volume title."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq, the integer used to order the book inside its collection.

        :param node: the <collection-meta> XML node (its seq attribute is read)
        :return: None
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

        # Third, use self.vseries as an offset
        # NOTE(review): the offset is assumed to apply whatever the origin of seq - confirm
        try:
            # No more than 10000 books in a series (gasp)
            self.seq = int(self.vseries) * 10000 + self.seq
        except (TypeError, ValueError):
            # vseries is not a number, or is None (ex: <volume-series/> without text):
            # no offset
            pass

3168 

3169 

class BitsBook(BookData, JatsBase):
    """
    Book parsed from a BITS XML tree.

    The tree given with the ``tree`` keyword is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the base classes, then parse the book.

        :param kwargs: must contain ``tree``, the XML node of the book
        """
        super().__init__(*args, **kwargs)
        book_tree = kwargs["tree"]
        self.parse_tree(book_tree)

    def parse_tree(self, tree):
        """
        Parse the children of the book node (collections, metadata, body, front matter,
        back matter), then fill in the contributors and the title when they are missing.

        :param tree: the XML node of the book
        :return: None
        """
        super().parse_tree(tree)

        self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

        for node in tree:
            # Children that are not of the same type as the tree are skipped
            if type(node) is not type(tree):
                continue

            tag = normalize(node.tag)

            if tag in ("collection-meta", "in-collection"):
                self.incollection.append(BitsCollection(tree=node))
            elif tag == "book-meta":
                self.parse_book_meta(node)
            elif tag == "book-body":
                self.parse_book_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "book-back":
                for back_child in node:
                    tag = normalize(back_child.tag)
                    if tag == "ref-list":
                        self.parse_ref_list(back_child)
                        continue
                    origin = (
                        self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                    )
                    self.warnings.append({self.pid: origin + " " + tag})
            else:
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})

        self.set_contribs()
        self.set_title()

    def parse_book_body(self, node, **kwargs):
        """
        Parse a <book-body> node: each <book-part> child becomes a BitsBookPart, appended
        to self.parts (its warnings are added to the warnings of the book).

        If no part was found, the text of the node is stored in self.body.
        """
        for element in node:
            # Children that are not of the same type as the node are skipped
            if type(element) is not type(node):
                continue

            tag = normalize(element.tag)

            if tag != "book-part":
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
                continue

            part = BitsBookPart(tree=element)
            self.warnings.extend(part.warnings)
            self.parts.append(part)

        if not self.parts:
            self.body = get_text_from_node(node)

    def parse_book_meta(self, node, **kwargs):
        """
        Parse a <book-meta> node: ids, year, volume, history dates, title and publisher.

        Any other tag is dispatched to the parse_<tag> method of the instance, if there
        is one (called with add_ext_link=True); otherwise it is reported in self.warnings.
        The last modification date defaults to the current time.
        """
        # pub-history date types that are stored, with the attribute they are stored in
        history_attributes = {
            "last-modified": "last_modified_iso_8601_date_str",
            "prod-deployed-date": "prod_deployed_date_iso_8601_date_str",
        }

        for element in node:
            tag = normalize(element.tag)

            if tag == "book-id":
                self.parse_id(element)
            elif tag == "pub-date":
                self.year = self.get_data_from_date(element)
            elif tag == "book-volume-number":
                self.volume = element.text
                self.volume_int = element.text
            elif tag == "pub-history":
                for history_date in self.get_data_from_history(element):
                    attribute = history_attributes.get(history_date["type"])
                    if attribute is not None:
                        setattr(self, attribute, history_date["date"])
            elif tag == "book-title-group":
                self.parse_title_group(element)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=element)
            else:
                # "Automatic" call of parse_<tag>
                handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(handler):
                    handler(element, add_ext_link=True)
                else:
                    origin = (
                        self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                    )
                    self.warnings.append({self.pid: origin + " " + tag})

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Parse a <custom-meta-group> node: the value of the <custom-meta> named "provider"
        is stored in self.provider. Everything else is ignored.
        """
        for element in node:
            if normalize(element.tag) != "custom-meta":
                continue

            name, value = self.get_data_from_custom_meta(element)
            if name == "provider":
                self.provider = value

    def set_contribs(self):
        """
        Update the contributors if the XML does not declare any author

        - with the contributors of the first part if the book is a monograph
        - with the contributors of the parts if the book is an edited book or lecture
          notes and all its parts have the same contributors
          (an edited book then becomes a monograph)
        - with a "Collectif" author if the parts have different contributors

        Nothing is done if the book has no parts.

        :return:
        """
        declared_authors = [
            contrib for contrib in self.contributors if contrib["role"] == "author"
        ]
        if declared_authors or not self.parts:
            return

        if self.ctype == "book-monograph":
            self.contributors = self.parts[0].contributors
            return

        if self.ctype not in ("book-edited-book", "book-lecture-notes"):
            return

        # check if authors of the book-parts are identical
        reference = self.parts[0].contributors
        parts_differ = any(part.contributors != reference for part in self.parts[1:])

        if parts_differ:
            collective = create_contributor()
            collective["string_name"] = "Collectif"
            collective["role"] = "author"
            collective["contrib_xml"] = get_contrib_xml(collective)
            self.contributors.append(collective)
            return

        if self.ctype == "book-edited-book":
            self.ctype = "book-monograph"
        self.contributors = reference

    def set_title(self):
        """
        Use the title of the first collection when the book has no title of its own.
        """
        if self.title_xml != "" or len(self.incollection) == 0:
            return

        first_collection = self.incollection[0]
        self.title_xml = first_collection.title_xml
        self.title_html = first_collection.title_html
        self.title_tex = first_collection.title_tex

3340 

3341 

3342class BitsBookPart(BookPartData, JatsArticleBase): 

def __init__(self, *args, **kwargs):
    """
    Initialise the base classes, then parse the book part.

    :param kwargs: must contain ``tree``, the XML node of the book part
    """
    super().__init__(*args, **kwargs)
    part_tree = kwargs["tree"]
    self.parse_tree(part_tree)

3346 

def parse_tree(self, tree):
    """
    Parse the children of a book part: metadata, body, front matter and the
    reference lists of the back matter. Unknown tags are reported in self.warnings.

    self.atype is set from the book-part-type attribute, and self.seq from the seq
    attribute when it is an integer.

    :param tree: the XML node of the book part
    :return: None
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "book-part-type") or ""
    seq_text = get_normalized_attrib(tree, "seq") or ""
    try:
        self.seq = int(seq_text)
    except ValueError:
        # No seq attribute, or not a number: self.seq is left untouched
        pass

    for element in tree:
        tag = normalize(element.tag)

        if tag == "book-part-meta":
            self.parse_book_part_meta(element)
        elif tag == "body":
            self.parse_body(element)
        elif tag == "front-matter":
            self.parse_front_matter(element)
        elif tag == "back":
            for back_child in element:
                tag = normalize(back_child.tag)

                if tag == "ref-list":
                    self.parse_ref_list(back_child)
                    continue
                origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: origin + " " + tag})
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    # Workaround a numdam-plus bug where a book-part can have a trans-title without a title
    # TODO: Fix numdam-plus, the books impacted and remove the hack
    self.set_title()

3395 

3396 def parse_book_part_meta(self, node, **kwargs): 

3397 for child in node: 

3398 tag = normalize(child.tag) 

3399 

3400 if tag == "book-part-id": 

3401 self.parse_id(child) 

3402 elif tag == "fpage": 

3403 self.fpage = child.text 

3404 self.page_type = get_normalized_attrib(child, "content-type") or "" 

3405 elif tag == "lpage": 

3406 self