Coverage for apps/ptf/cmds/xml/jats/jats_parser.py: 70%

2055 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-19 19:20 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# jats_parser.py is a replacement of xmldata.py 

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom. 

7# Each node is read only once. 

8# 

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds. 

10# The xml tree is parsed in the class constructor (__init__) 

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables. 

12# Some parse_<tag> functions are called directly. 

13# Ex: if tag == "article-meta": 

14# self.parse_article_meta(child) 

15# Other parse_<tag> functions are called "automatically" 

16# fct_name = 'parse_' + tag.replace('-', '_') 

17# ftor = getattr(self, fct_name, None) 

18# if callable(ftor): 

19# ftor(child) 

20# 

21# JatsBase and JatsArticleBase are base classes. 

22# They provide common instance variables and their corresponding parse_<tag> functions 

23# 

24# html_from_<tag> are used to generate the HTML text of a node with mixed content: 

25# a node that mixes text, children and tail 

26# These functions can also extract data and set instance variables (ex: self.figures) 

27# 

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects 

29# 

30# At the end of this file, there are some functions that are/were called by ptf-tools. 

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser 

32# 

33# TODO: the import OAI or the import of a collection could simply call the first function 

34# (def parser(tree)) 

35# 

36################################################################################################## 

37 

38import copy 

39import inspect 

40import os 

41import re 

42 

43from lxml import etree 

44from pylatexenc.latexencode import unicode_to_latex 

45 

46from django.conf import settings 

47from django.urls import reverse 

48from django.utils import timezone 

49 

50from matching import scrapping 

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title 

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors 

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title 

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source 

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume 

56from ptf.cmds.xml.citation_html import get_citation_html 

57from ptf.cmds.xml.xml_base import RefBase 

58from ptf.cmds.xml.xml_base import XmlParserBase 

59from ptf.cmds.xml.xml_utils import escape 

60from ptf.cmds.xml.xml_utils import get_contrib_xml 

61from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions 

62from ptf.cmds.xml.xml_utils import get_normalized_attrib 

63from ptf.cmds.xml.xml_utils import get_text_from_node 

64from ptf.cmds.xml.xml_utils import get_xml_from_node 

65from ptf.cmds.xml.xml_utils import helper_update_name_params 

66from ptf.cmds.xml.xml_utils import make_links_clickable 

67from ptf.cmds.xml.xml_utils import normalize 

68from ptf.cmds.xml.xml_utils import normalize_space 

69from ptf.cmds.xml.xml_utils import split_kwds 

70from ptf.display import resolver 

71from ptf.model_data import ArticleData 

72from ptf.model_data import BookData 

73from ptf.model_data import BookPartData 

74from ptf.model_data import CollectionData 

75from ptf.model_data import Foo 

76from ptf.model_data import IssueData 

77from ptf.model_data import JournalData 

78from ptf.model_data import MathdocPublicationData 

79from ptf.model_data import PublisherData 

80from ptf.model_data import create_contributor 

81from ptf.model_data import create_extlink 

82 

83 

84class JatsBase(XmlParserBase): 

def __init__(self, *args, **kwargs):
    """
    Set up the parsing state shared by the JATS/BITS parsers.

    Positional and keyword arguments are accepted but are not forwarded
    to the parent constructor.
    """
    super().__init__()

    # XML tree being parsed; set by parse_tree()
    self.tree = None
    # Footnotes (HTML strings) collected by parse_node_with_fn
    self.fns = []
    # Warnings collected while parsing, as {pid: message} dicts
    self.warnings = []

    # Used to create a Tex file from an XML value (ie abstract)
    self.for_tex_file = False
    # Used to convert an XML value for CKEditor (ie abstract)
    self.add_span_around_tex_formula = False

94 

def parse_tree(self, tree):
    """
    Remember the XML tree and read its language.

    :param tree: XML node; its normalized "lang" attribute gives self.lang,
                 with "und" used when the attribute is missing or empty
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    self.lang = lang if lang else "und"

98 

def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node of the article title
    :param kwargs: forwarded to the inner parser; when "is_mixed_citation"
                   is truthy the HTML is passed through
                   add_span_class_to_html_from_article_title
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

107 

108 def parse_node_with_break(self, node, **kwargs): 

109 tex = "\\newline\n" if self.for_tex_file else " " 

110 html = "<br/>" 

111 

112 return tex, html 

113 

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap node as a two-cell "formula" table:
    the content on the left, the label on the right.

    :param node: XML node of the chem-struct-wrap
    :return: (text, text): the same HTML table is returned for tex and html
    """
    struct_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = None
    content_parts = []

    for child in node:
        is_label = normalize(child.tag) == "label"
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if is_label:
            label_html = child_html
        else:
            content_parts.append(child_html)

    id_attr = f'id="{struct_id}" ' if struct_id else ""
    label_cell = label_html if label_html else ""
    content = "".join(content_parts)

    markup = (
        f'<table {id_attr}class="formula"><tr>'
        f'<td class="formula-inner">{content}</td>'
        f'<td class="formula-label">{label_cell}</td></tr>'
        "</table>"
    )

    return markup, markup

141 

def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote node in a <div class="disp-quote">.

    :param node: XML node of the disp-quote
    :return: (tex, html), both wrapped in the same div
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    wrapper = '<div class="disp-quote">{}</div>'

    return wrapper.format(inner_tex), wrapper.format(inner_html)

149 

def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text node in a <div class="boxed-text">.

    :param node: XML node of the boxed-text; its "id" attribute, when
                 present and non empty, is copied to the div
    :return: ("", html): no TeX is produced
    """
    box_id = node.attrib.get("id")

    _, content_html = self.parse_inner_node(node, **kwargs)

    if box_id:
        opening = f'<div id="{box_id}" class="boxed-text">'
    else:
        opening = '<div class="boxed-text">'

    return "", f"{opening}{content_html}</div>"

163 

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    <attrib> and <permissions>/<copyright-statement> children are appended
    to the caption as <p class="fig-small-caption"> paragraphs.
    Unexpected children are reported in self.warnings.

    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the fig has an id, the HTML is also stored in
    self.floats[fig_id].

    :param node: XML node of a fig
    :return: ("", html): no TeX is produced
    """
    fig_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = title_html = None
    # Start with an empty string (not None): <attrib>/<permissions> may be
    # appended without any caption paragraph, and None would be rendered
    # as the literal text "None".
    caption_html = ""
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                tag = normalize(caption_child.tag)
                if tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif tag == "p":
                    _, caption_p_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # Several paragraphs: the first one keeps the normal
                        # size, the following ones are displayed smaller
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += caption_p_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = caption_p_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            caption_html += f'<p class="fig-small-caption">{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                # NOTE(review): the tag is compared without normalize(),
                # unlike the other tags of this function - confirm
                if gchild.tag == "copyright-statement":
                    _, statement_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html += f'<p class="fig-small-caption">{statement_html}</p>'
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    if fig_id:
        html = '<figure id="' + fig_id + '">'
    else:
        html = "<figure>"

    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"

        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html

        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

268 

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>
    becomes: <p><sup>LABEL</sup> TEXT</p>

    Recognized kwargs:
    - keep_fn (default False): when truthy, the HTML of the footnote is
      returned. Otherwise the footnote is appended to self.fns (unless
      already there) and an empty string is returned.
    - keep_fn_label (default True): when falsy, the label is left out.

    Unexpected children are reported in self.warnings.

    :param node: XML node of a fn
    :return: ("", html): no TeX is produced
    """
    # Both flags are optional: use defaults instead of raising a KeyError
    # when the caller does not provide them
    keep_fn = kwargs.get("keep_fn", False)
    keep_fn_label = kwargs.get("keep_fn_label", True)

    fn_html = ""
    label_html = None
    fn_id = node.attrib["id"] if "id" in node.attrib else None

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            # The <p> of the footnote is replaced below by a <p> that may
            # carry the id. If the fn has several <p>, only the last one
            # is kept.
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"

    if label_html and keep_fn_label:
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    if keep_fn:
        return "", html

    if html not in self.fns:
        self.fns.append(html)

    return "", ""

315 

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image (rel, mimetype, location, ...) is
    appended to self.figures and self.related_objects, unless it is
    already in self.figures.

    :param node: XML node of a graphic
    :param kwargs: "base_url" is required when the graphic has an href
    :return: ("", html). html is the <a><img/></a> markup, or an empty
             string when the graphic has no href
    """
    href = ""
    for attrib in node.attrib:
        # The attribute name may be namespaced (xlink:href)
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display. (img_text used to be undefined in this case,
        # which raised an UnboundLocalError.)
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename
    if ext in get_elsevier_image_extensions():
        # Elsevier uses "jc3" instead of jpg
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
            data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    # The lightbox index is the number of figures found so far
    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

384 

def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Parse an inline-formula or disp-formula node.

    The formula is expected inside <alternatives>, as <tex-math> and/or
    MathML <math>. An optional <label> gives the equation label.
    A message is printed when only one of the two alternatives is found.
    Unexpected children are reported in self.warnings.

    :param node: XML node of the formula
    :return: (tex, html). html is a <span class="mathjax-formula"> holding
             the MathML (or the TeX if there is no MathML), wrapped in a
             <table class="formula"> for labelled or display formulas
    """
    # MathJAX is doing a good job with formulae and is now the standard
    # MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    # We could simply return the tex-math text
    # But there are multiple errors in the TeX of the Mersenne articles.
    # We first need to fix those mistakes before switching to TeX

    is_display = node.tag == "disp-formula"
    formula_id = node.attrib["id"] if "id" in node.attrib else None
    label = None
    tex_math = math_text = ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "label":
            label = child.text or ""
            continue

        if tag != "alternatives":
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )
            continue

        for alternative in child:
            alt_tag = normalize(alternative.tag)
            if alt_tag == "tex-math":
                tex_math = alternative.text or ""
            elif alt_tag == "math":
                # Elsevier sometimes provides the formula as an alternative image. Remove it.
                alternative.attrib.pop("altimg", None)

                math_text = get_xml_from_node(alternative)
                for unwanted in (
                    "mml:",
                    'xmlns:xlink="http://www.w3.org/1999/xlink"',
                    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"',
                ):
                    math_text = math_text.replace(unwanted, "")
                if is_display:
                    math_text = math_text.replace("<math", '<math display="block"')

    # Exactly one of the 2 alternatives is missing
    if (math_text == "") != (tex_math == ""):
        ignored = ("parse_node", "parse_inner", "parse_tree", "parse_article_meta")
        callers = []
        for frameinfo in inspect.stack()[1:]:
            fct_name = frameinfo[3]
            if fct_name.startswith("parse_") and not any(
                name in fct_name for name in ignored
            ):
                callers.append(fct_name)
        stack_str = " ".join(callers)
        print(f"{self.pid} no math formula for {stack_str}")

    if not is_display and tex_math != "":
        # Inline TeX must be surrounded by $
        if not tex_math.startswith("$"):
            tex_math = "$" + tex_math
        if not tex_math.endswith("$"):
            tex_math = tex_math + "$"

    tex = tex_math

    in_table = bool(label) or is_display
    alt_text = tex_math.replace("\n", "") if is_display else tex_math

    html = ""
    if in_table:
        html += '<table class="formula"><tr><td class="formula-inner">'

    html += '<span class="mathjax-formula" '
    if formula_id:
        html += 'id="' + formula_id + '" '
    displayed = math_text if math_text else tex_math
    html += f'data-tex="{alt_text}">{displayed}</span>'

    if in_table:
        html += '</td><td class="formula-label">'
        if label:
            html += label
        html += "</td></tr>"
        html += "</table>"

    if self.add_span_around_tex_formula:
        # The first and last characters of the TeX are replaced by \( and \)
        tex = f'<span class="mathjax-formula">\\({tex[1:-1]}\\)</span>'

    return tex, html

481 

def parse_node_with_institution_id(self, node, **kwargs):
    """
    institution-id nodes produce no text.

    :param node: XML node of the institution-id (not read)
    :return: ("", "")
    """
    nothing = ""
    return nothing, nothing

484 

def parse_node_with_italic(self, node, **kwargs):
    """
    Parse an italic node.

    :param node: XML node of the italic
    :return: (tex, html). tex uses {\\it ...} when a TeX file is generated,
             <i> otherwise; html uses <span class="italique">
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "{\\it " + inner_tex + "}"
    else:
        tex = f"<i>{inner_tex}</i>"

    return tex, f'<span class="italique">{inner_html}</span>'

505 

def parse_node_with_list(self, node, **kwargs):
    """
    Parse a list node.

    The "list-type" attribute selects the HTML list:
    - bullet, simple: <ul>
    - order, number: <ol type="1"> (with a start attribute if the list
      has a "continued-from" attribute)
    - alpha-lower, alpha-upper, roman-lower, roman-upper: <ol type="a|A|i|I">
    - anything else: <ul> without bullets

    When a TeX file is generated, tex is an itemize (bullet, simple) or
    enumerate (all other types) environment; otherwise tex uses the same
    tags as html.

    :param node: XML node of the list
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    ol_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }

    list_type = node.get("list-type")
    if list_type == "bullet" or list_type == "simple":
        tex_env = "itemize"
        open_tag, close_tag = "<ul>", "</ul>"
    else:
        tex_env = "enumerate"
        ol_type = ol_types.get(list_type)
        if ol_type is None:
            open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
            close_tag = "</ul>"
        else:
            if ol_type == "1" and start is not None:
                open_tag = f'<ol type="1" start="{str(start)}">'
            else:
                open_tag = f'<ol type="{ol_type}">'
            close_tag = "</ol>"

    # The HTML is always wrapped, whatever the tex mode: the <li> of an
    # ordered list used to be returned without their <ol> when a TeX file
    # was generated
    html = f"{open_tag}{html}{close_tag}"

    if self.for_tex_file:
        tex = "\n\\begin{" + tex_env + "}\n" + tex + "\\end{" + tex_env + "}\n"
    else:
        tex = f"{open_tag}{tex}{close_tag}"

    return tex, html

550 

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined after the label. The following <p> (wrapped
    in <p> in the HTML) and the embedded <list> are appended after it, in
    document order. Unexpected children are reported in self.warnings.

    :param node: XML node of the list-item
    :return: (tex, html). tex is an \\item when a TeX file is generated,
             a <li> otherwise
    """
    title_tex = title_html = label_tex = label_html = ""
    p_tex = p_html = content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Accumulate: assigning here would drop the content of the
                # previous <p>/<list> children
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

611 

def parse_node_with_name_content(self, node, **kwargs):
    """
    Parse a name-content node: its content is returned without decoration.

    :param node: XML node of the name-content
    :return: (tex, html)
    """
    content_tex, content_html = self.parse_inner_node(node, **kwargs)

    return content_tex, content_html

615 

def parse_node_with_p(self, node, **kwargs):
    """
    Parse a p node.

    The HTML is wrapped in a <p>, with the "specific-use" attribute of the
    node (if any) as CSS class. The TeX is wrapped in a <p> unless a TeX
    file is generated.

    If the instance has both "floats_to_insert" and "floats" attributes,
    floats_to_insert is emptied and the HTML of each pending float found
    in self.floats is moved right after the paragraph.

    :param node: XML node of the p
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = f"<p>{tex}</p>"

    css_class = node.get("specific-use")
    opening = f'<p class="{css_class}">' if css_class else "<p>"
    html = f"{opening}{html}</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while self.floats_to_insert:
            float_id = self.floats_to_insert.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html

636 

def parse_node_with_sc(self, node, **kwargs):
    """
    Parse a sc (small caps) node.

    :param node: XML node of the sc
    :return: (tex, html). The tex is left as is; the html is wrapped in a
             <span class="smallcaps">
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, f'<span class="smallcaps">{inner_html}</span>'

642 

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of the sec
    :param kwargs: "sec_level" is the heading level of this section
                   (2 by default). The children are parsed with
                   sec_level + 1
    :return: (tex, html). tex is the label and title followed by the
             content, without any markup of its own
    """
    sec_level = kwargs.get("sec_level", 2)
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    body_tex = []
    body_html = []

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            # label and title are parsed without the kwargs
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            body_tex.append(child_tex)
            body_html.append(child_html)

    tex = ""
    html = "<section>"

    if label_html or title_html:
        heading_html = ""
        if label_html:
            tex += label_tex
            heading_html += label_html
        if label_html and title_html:
            tex += " "
            heading_html += " "
        if title_html:
            tex += title_tex
            heading_html += title_html
        html += f"<h{sec_level}>{heading_html}</h{sec_level}>"

    tex += "".join(body_tex)
    html += "".join(body_html) + "</section>"

    return tex, html

690 

def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    :param node: XML node of the string-name
    :param kwargs: when "is_mixed_citation" is truthy, the HTML is
                   title-cased (str.title) and passed through
                   add_span_class_to_html_from_authors
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html

699 

def parse_node_with_strong(self, node, **kwargs):
    """
    Parse a strong (bold) node.

    :param node: XML node of the strong
    :return: (tex, html). tex uses {\\bf ...} when a TeX file is generated,
             <strong> otherwise; html uses <strong>
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    html = f"<strong>{inner_html}</strong>"
    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"

    return tex, html

710 

def parse_node_with_styled_content(self, node, **kwargs):
    """
    Parse a styled-content node.

    :param node: XML node of the styled-content
    :return: (tex, html). The html is wrapped in a <span> carrying the
             "style" attribute of the node when it exists and is not empty
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style == "":
        return tex, html

    return tex, f'<span style="{style}">{html}</span>'

720 

def parse_node_with_sub(self, node, **kwargs):
    """
    Parse a sub (subscript) node.

    :param node: XML node of the sub
    :return: (tex, html). tex uses \\textsubscript{...} when a TeX file is
             generated, <sub> otherwise; html uses <sub>
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    html = f"<sub>{inner_html}</sub>"
    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else f"<sub>{inner_tex}</sub>"

    return tex, html

731 

def parse_node_with_sup(self, node, **kwargs):
    """
    Parse a sup (superscript) node.

    :param node: XML node of the sup
    :return: (tex, html). tex uses \\textsuperscript{...} when a TeX file
             is generated, <sup> otherwise; html uses <sup>
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    html = f"<sup>{inner_html}</sup>"
    tex = (
        "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else f"<sup>{inner_tex}</sup>"
    )

    return tex, html

742 

def parse_node_with_table_generic(self, node, **kwargs):
    """
    Parse a node of a table (table, row, entry, ...) and render it with
    the HTML tag of the same name ("row" becomes tr, "entry" becomes td).

    A table gets the "table" CSS class, plus "nowrap-col-<i>" for each
    colgroup/col (numbered from 1) that has a width attribute.
    The rowspan, colspan, align and style attributes are copied;
    valign becomes a "td-valign-<value>" CSS class.

    :param node: XML node of the table element
    :return: ("", html): no TeX is produced
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    pieces = ["<" + tag]

    if tag == "table":
        css_classes = ["table"]
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                css_classes.append(f"nowrap-col-{position}")
        class_table = " ".join(css_classes)
        pieces.append(f' class="{class_table}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            pieces.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        pieces.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        pieces.append(' style="' + node.attrib["style"] + '"')
    pieces.append(">")

    open_tag = "".join(pieces)

    return "", f"{open_tag}{inner_html}</{tag}>"

779 

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table.
    The label (in bold) and the caption are put in a
    <div class="table-wrap-header"> before the table.

    When kwargs["append_floats"] is truthy, the instance has a "floats"
    attribute and the table-wrap has an id, the HTML is also stored in
    self.floats[table_id].

    :param node: XML node of the table-wrap
    :return: ("", html): no TeX is produced
    """
    table_id = node.attrib["id"] if "id" in node.attrib else None
    label = caption = None
    content_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            content_parts.append(child_html)

    if table_id:
        text = '<div class="table-wrap table-responsive" id="' + table_id + '">'
    else:
        text = '<div class="table-wrap table-responsive">'

    has_header = label or caption
    if has_header:
        text += '<div class="table-wrap-header">'
    if label:
        text += "<strong>" + label + "</strong>"
    if caption:
        if label:
            text += " "
        text += caption
    if has_header:
        text += "</div>"

    text += "".join(content_parts)
    text += "</div>"

    store_float = (
        "append_floats" in kwargs
        and kwargs["append_floats"]
        and hasattr(self, "floats")
        and table_id is not None
    )
    if store_float:
        self.floats[table_id] = text

    return "", text

835 

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Create a <div class="table-wrap-foot"> at the bottom of the table.
    Only the <fn-group> children are rendered; they are parsed with keep_fn=True
    so that the footnotes stay inside this div.

    :param node: the table-wrap-foot XML node
    :param kwargs: parsing options, forwarded to parse_node_with_mixed_content
    :return: a (tex, html) tuple; tex is always empty
    """
    kwargs["keep_fn"] = True

    fn_groups_html = [
        self.parse_node_with_mixed_content(child, **kwargs)[1]
        for child in node
        if normalize(child.tag) == "fn-group"
    ]

    return "", '<div class="table-wrap-foot">' + "".join(fn_groups_html) + "</div>"

856 

def parse_node_with_toc(self, node, **kwargs):
    """
    Wrap the HTML of the children of a <toc> node in a <table>.

    :param node: the toc XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: a (tex, html) tuple; tex is always empty
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", f"<table>{rows_html}</table>"

866 

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Create the table row (<tr>) of a <toc-entry> node.

    The row has 2 cells: "label title" and "p. page".
    When a <nav-pointer specific-use="pageindex"> is found in a <nav-pointer-group>,
    both cells link to the "item-pdf" URL of self.pid, at page (pageindex + 1).
    Embedded <toc-entry> nodes are parsed with inside_toc_entry=True (their first cell
    gets the "inside-toc" class) and their rows are added after the row of this entry.

    :param node: the toc-entry XML node
    :param kwargs: parsing options, forwarded to parse_node_with_mixed_content
    :return: a (tex, html) tuple; tex is always empty
    """
    label = title = page = anchor = ""
    nested_rows = ""
    cell_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "title":
            _, title = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "label":
            _, label = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer":
            _, page = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "nav-pointer-group":
            for pointer in child:
                if pointer.tag != "nav-pointer":
                    continue
                pointer_use = pointer.attrib.get("specific-use")
                if pointer_use == "pagenum":
                    _, page = self.parse_node_with_mixed_content(pointer, **kwargs)
                elif pointer_use == "pageindex":
                    anchor = int(pointer.text) + 1
        elif tag == "toc-entry":
            # The kwargs are not forwarded here: only inside_toc_entry is set
            _, nested = self.parse_node_with_mixed_content(child, inside_toc_entry=True)
            nested_rows += nested

    entry_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        link = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        link += f"#page={anchor}"
        entry_text = f'<a href="{link}">{entry_text}</a>'
        page_text = f'<a href="{link}">{page_text}</a>'

    row = f'<tr><td class="{cell_class}">{entry_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows

920 

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap both the TeX and the HTML text of an <underline> node in a <u> tag.

    :param node: the underline XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: a (tex, html) tuple
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"

927 

def parse_node_with_volume(self, node, **kwargs):
    """
    Return the text of a <volume> node.
    Inside a mixed-citation (is_mixed_citation kwarg), the HTML is post-processed by
    add_span_class_to_html_from_volume.

    :param node: the volume XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: a (tex, html) tuple
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

936 

def parse_node_with_xref(self, node, **kwargs):
    """
    Create an HTML link (<a href="#id">) from an <xref> node.

    The "rid" attribute may hold several ids separated by whitespace: the link points
    to the first one. An id starting with "bib" is rewritten with an "r" prefix instead
    ("bib12" becomes "r12").
    If the ref-type is "fig", "table" or "textbox" and the instance has a
    "floats_to_insert" attribute, every id of "rid" is appended to it.

    :param node: the xref XML node
    :param kwargs: parsing options; a truthy "ignore_xref" discards the node
    :return: a (tex, html) tuple; both are empty if the xref is ignored or has no id
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # A blank rid (ex: rid=" ") is truthy but contains no id:
    # split first, then test the list, to avoid an IndexError on rids[0].
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # ref-type is the same for all the ids: read it once, outside of any loop
    ref_type = node.get("ref-type")
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

959 

def parse_inner_node(self, node, **kwargs):
    """
    Return the text of a node followed by the text of its children (the tail of the
    node itself is not added here).
    Used by the parse_node_with_<tag> functions for nodes that have a different tag
    in HTML.

    :param node: XML node
    :param kwargs: parsing options; is_top is forced to False for the children
    :return: a (tex, html) tuple
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    tex_parts = []
    html_parts = []

    text = node.text
    if text:
        tex_parts.append(unicode_to_latex(text) if self.for_tex_file else text)
        html_parts.append(escape(text))

    for child in node:
        child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex_parts.append(child_tex)
        html_parts.append(child_html)

    return "".join(tex_parts), "".join(html_parts)

984 

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse and return the TeX and HTML text of an XML node which mixes text and XML sub-nodes.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>

    A tag that has a matching parse_node_with_<tag> method is delegated to this method.
    <ext-link> and <uri> are replaced by a link, <supplementary-material> is only
    recorded (it gives no text). For any other tag, the escaped text of the node is
    followed by the text of its children, obtained by calling this function recursively.

    :param node: XML node (None gives empty strings)
    :param kwargs: params of the function; missing params get a default value
    :return: a (tex, html) tuple
    """
    if node is None:
        return "", ""

    option_defaults = (
        # The tail is the text following the end of the node
        # Ex: <node>text1<a>text_a</a>a_tail</node>
        # The text has to include the tail only if this function was called recursively
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... in the HTML text while parsing nodes like <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url to image links
        ("base_url", ""),
        # footnotes are removed from the fulltext (and put at the end) except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, default in option_defaults:
        kwargs.setdefault(option, default)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    inner_tex = inner_html = ""

    # I. Text of the node itself.
    # Some tags share the parse_node_with_<tag> function of another tag
    tag_mapped = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
        "table": "table-generic",
        "th": "table-generic",
        "tr": "table-generic",
        "td": "table-generic",
        "thead": "table-generic",
        "tbody": "table-generic",
        "colgroup": "table-generic",
        "col": "table-generic",
        "tgroup": "table-generic",
        "entry": "table-generic",
        "row": "table-generic",
    }
    handler_name = "parse_node_with_" + tag_mapped.get(tag, tag).replace("-", "_")
    handler = getattr(self, handler_name, None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and kwargs["add_ext_link"] and not kwargs["is_comment"]:
            found_extid = self.parse_ext_link(node, **kwargs)
            if found_extid and kwargs["is_mixed_citation"]:
                # an extid has been found in a mixed_citation:
                # no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Text of the node (before the text of the children)
        if node.text is not None:
            leading_text = node.text
            if self.for_tex_file:
                leading_text = unicode_to_latex(leading_text)
            inner_tex += leading_text
            inner_html += escape(node.text)

        # II.2. Children: they are not top nodes, so their tail is added
        child_kwargs = kwargs.copy()
        child_kwargs["is_top"] = False

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in [",", "."]
                and inner_html[-2:] == ". "
            ):
                inner_html = inner_html[:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. Wrap the text with HTML links,
        # unless the text of the node starts with a newline or a space
        if kwargs["add_HTML_link"] and node.text:
            if not re.match(r"[\n ]+", node.text):
                inner_html = make_links_clickable(node.text, inner_html)

    tex = inner_tex
    html = inner_html

    # III. Add the tail of the node, for children only
    if node.tail and not kwargs["is_top"]:
        tail_text = node.tail
        if self.for_tex_file:
            tail_text = unicode_to_latex(tail_text)
        tex += tail_text
        html += escape(node.tail)

    return tex, html

1136 

def parse_abstract(self, node, **kwargs):
    """
    Append the abstract found in an <abstract> node to self.abstracts.
    The tag of the abstract is its "abstract-type" attribute ("abstract" if the
    attribute is missing or is "author"). The language falls back to self.lang.

    :param node: the abstract XML node
    :param kwargs: unused
    :return: None
    """
    abstract_type = get_normalized_attrib(node, "abstract-type")
    if not abstract_type or abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(abstract)

1154 

def parse_aff_alternatives(self, node, **kwargs):
    """
    Add the address found in an <aff-alternatives> node to the contributors.

    If an <aff> has a <label>, the address is the text of the <aff> without the first
    len(label) characters, and it is only added to the contributors whose "xrefs"
    contain the id of the node. Otherwise the address is the text of the <aff> and it
    is added to all the contributors.
    A child that is not an <aff> creates a warning.

    :param node: the aff-alternatives XML node
    :param kwargs: unused
    :return: None
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    aff_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for aff_child in child:
            if aff_child.tag == "label" and address == "":
                label = get_text_from_node(aff_child)
                address = get_text_from_node(child)[len(label) :]
                aff_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        if ("xrefs" in contrib and xref_id in contrib["xrefs"]) or aff_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1191 

def parse_award_group(self, node, **kwargs):
    """
    Append an award to self.awards, from the <award-id> and <funding-source> children
    of an <award-group> node. Nothing is appended unless both are found.
    Any other child creates a warning.

    :param node: the award-group XML node
    :param kwargs: unused
    :return: None
    """
    funding_source = None
    award_number = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award_number = child.text
        elif tag == "funding-source":
            funding_source = get_text_from_node(child)
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    if funding_source is None or award_number is None:
        return

    self.awards.append({"abbrev": funding_source, "award_id": award_number})

1215 

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a <contrib-group> node:
    - each <contrib> is appended to self.contributors. Its role is the "content-type"
      of the group without its final "s", put before the role of the contrib (if any)
      with a "|" separator
    - <aff-alternatives> are handled by parse_aff_alternatives
    - <fn> are added to self.footnotes_xml and self.footnotes_html
    Any other child creates a warning.

    :param node: the contrib-group XML node
    :param kwargs: unused
    :return: None
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            if contrib["role"]:
                contrib["role"] = f"{group_role}|{contrib['role']}"
            else:
                contrib["role"] = group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

1246 

def parse_counts(self, node, **kwargs):
    """
    Append a (tag, value) tuple to self.counts for each child of a <counts> node.
    The value is the "count" attribute of the child, or its text if the attribute is
    missing. A child without any value is ignored.
    "book-page-count" is stored as "page-count".

    :param node: the counts XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        name = normalize(child.tag)
        if name == "book-page-count":
            name = "page-count"

        self.counts.append((name, value))

1259 

def parse_ext_link(self, node, **kwargs):
    """
    Extract the data of an <ext-link> node and give it to
    self.add_extids_from_node_with_link.
    If that call returns no extid type (first item is None), the "add_ext_link" kwarg is
    truthy, the rel of the link is not "cover" and the link is not already in
    self.ext_links, the link is appended to self.ext_links.

    :param node: the ext-link XML node
    :param kwargs: parsing options; only "add_ext_link" is read
    :return: True if the first item returned by add_extids_from_node_with_link is not None
    """
    link = self.get_data_from_ext_link(node)
    extid = self.add_extids_from_node_with_link(link)
    is_extid = extid[0] is not None

    if (
        kwargs.get("add_ext_link", False)
        and not is_extid
        and link not in self.ext_links
        and link["rel"] != "cover"
    ):
        self.ext_links.append(link)

    return is_extid

1274 

def parse_front_matter(self, node, **kwargs):
    """
    Store the XML of a <front-matter> node in self.frontmatter_xml, and the HTML of its
    <foreword> and <toc> children in self.frontmatter_foreword_html and
    self.frontmatter_toc_html.
    Only frontmatter_foreword_html is reset (to "") before the children are parsed.

    :param node: the front-matter XML node
    :param kwargs: unused
    :return: None
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    html_attributes = {
        "foreword": "frontmatter_foreword_html",
        "toc": "frontmatter_toc_html",
    }

    for child in node:
        attribute = html_attributes.get(normalize(child.tag))
        if attribute is not None:
            _, child_html = self.parse_node_with_mixed_content(child)
            setattr(self, attribute, child_html)

1286 

def parse_id(self, node, **kwargs):
    """
    Store the id found in the text of the node, according to its type.
    The type is the "pub-id-type", "book-id-type" or "book-part-id-type" attribute
    (first one found, in that order):
    - "pii": stored in self.pii, only if self.pid has more than 2 characters and
      starts with "CR"
    - "numdam-id" / "mathdoc-id": stored in self.pid
    - "ark": appended to self.extids
    - "doi" / "eid": appended to self.ids; a doi is also stored in self.doi
    Other types are ignored.

    :param node: the XML node of the id
    :param kwargs: unused
    :return: None
    """
    value = node.text

    id_type = ""
    for attribute in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attribute in node.attrib:
            id_type = node.attrib[attribute]
            break

    if id_type == "pii":
        # Elsevier ids get a special treatment: web scraping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid[:2] == "CR":
            self.pii = value
    elif id_type in ("numdam-id", "mathdoc-id"):
        self.pid = value
    elif id_type == "ark":
        self.extids.append((id_type, value))
    elif id_type in ("doi", "eid"):
        self.ids.append((id_type, value))
        if id_type == "doi":
            self.doi = value

1310 

def parse_kwd_group(self, node, **kwargs):
    """
    Extend self.kwds with the keywords of a <kwd-group> node.

    The keywords are the text of the <kwd> children. An <unstructured-kwd-group> child
    replaces them with the result of split_kwds applied to its TeX text.
    Any other child creates a warning.
    The type of the keywords is the "content-type" attribute of the group or, if it is
    missing or empty, its "kwd-group-type" attribute. The language falls back to self.lang.

    :param node: the kwd-group XML node
    :param kwargs: unused
    :return: None
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    # The attribute was read as "content-node_type", a name that no XML can provide
    # (artefact of a type -> node_type renaming): read "content-type" instead.
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend({"type": content_type, "lang": lang, "value": kwd} for kwd in kwds)

1340 

def parse_ref_list(self, node, **kwargs):
    """
    Parse the children of a <ref-list> node:
    - each <ref> creates a JatsRef, appended to self.bibitems. Its citation_html is
      appended to self.bibitem and its warnings are added to self.warnings
    - <p> is parsed as a node with mixed content; the returned text is discarded
    Any other child creates a warning.

    :param node: the ref-list XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            bibitem = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(bibitem.warnings)
            self.bibitems.append(bibitem)
            self.bibitem.append(bibitem.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: where + " " + tag})

1363 

def parse_related_article(self, node, **kwargs):
    """
    Append a relation (rel_type, id_value) to self.relations.
    rel_type is the "related-article-type" attribute of the node, id_value is its text.
    If the instance has a "pii" attribute and the text is neither "NONE" nor a value
    containing "10.", the text is replaced by the result of scrapping.fetch_article.

    :param node: the related-article XML node
    :param kwargs: unused
    :return: None
    """
    relation = Foo()
    relation.rel_type = get_normalized_attrib(node, "related-article-type") or ""

    target_id = node.text
    if (
        hasattr(self, "pii")
        and target_id
        and "10." not in target_id
        and target_id != "NONE"
    ):
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        target_id = scrapping.fetch_article(self.doi, target_id, pii_doi_equivalence=True)

    relation.id_value = target_id
    self.relations.append(relation)

1379 

def parse_related_object(self, node, **kwargs):
    """
    Parse a <related-object> node.

    Without a "document-id-type" attribute, the data of the node (rel, mimetype,
    location, base, metadata) is appended to self.related_objects.
    With a "document-id-type" attribute, a "refers to" relation to the "document-id"
    attribute is appended to self.relations, unless the id is "NONE". A non-empty id
    that does not contain "10." is first replaced by the result of
    scrapping.fetch_article.

    :param node: the related-object XML node
    :param kwargs: unused
    :return: None
    """
    mimetype = node.get("content-type") or ""
    rel = node.get("link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""
    metadata = get_xml_from_node(node)

    related_object = {
        "rel": rel,
        "mimetype": mimetype,
        "location": location,
        "base": base,
        "metadata": metadata,
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(related_object)
        return

    target_id = node.get("document-id") or ""
    if target_id == "NONE":
        return

    if target_id and "10." not in target_id:
        # a pii is used instead of a DOI
        # Call Elsevier to get the doi
        target_id = scrapping.fetch_article(self.doi, target_id, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = target_id

    self.relations.append(relation)

1412 

def parse_sec(self, node, **kwargs):
    """
    Parse the <ref-list> children of a <sec> node with parse_ref_list.
    <title> children are ignored, any other child creates a warning.

    :param node: the sec XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref-list":
            self.parse_ref_list(child)
        elif tag != "title":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

1431 

def parse_self_uri(self, node, **kwargs):
    """
    Append the "full-text" stream described by a <self-uri> node to self.streams.
    The mimetype is the "content-type" attribute ("text/html" by default).
    Nodes with the "application/xml" mimetype are not appended.

    :param node: the self-uri XML node
    :param kwargs: unused
    :return: None
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive do not declare the PDF location like the other
    # Mathdoc collections: the collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    text = "" if node.text is None else normalize_space(node.text)
    stream = {
        "rel": "full-text",
        "mimetype": mimetype,
        "location": location,
        "base": base,
        "text": text,
    }

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignore "application/xml" in add_objects_with_location: they are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(stream)

1459 

def parse_sub_article(self, node, **kwargs):
    """
    Create a JatsArticle from a <sub-article> node and append it to self.translations
    (sub-articles are used for translations).

    :param node: the sub-article XML node
    :param kwargs: unused
    :return: None
    """
    self.translations.append(JatsArticle(tree=node))

1464 

def parse_subj_group(self, node, **kwargs):
    """
    Append the <subject> children of a <subj-group> node to self.subjs.
    The type of the subjects is the "subj-group-type" attribute of the group, the
    language falls back to self.lang. Any other child creates a warning.

    :param node: the subj-group XML node
    :param kwargs: unused
    :return: None
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1486 

def parse_supplementary_material(self, node, **kwargs):
    """
    Append the material described by a <supplementary-material> node to
    self.supplementary_materials, unless a material with the same file name
    (basename of the location) is already there.

    The location is the "href" attribute or, if missing, the "id" attribute.
    The mimetype is the "mimetype" attribute or, if missing, the result of
    resolver.get_mimetype on the location.

    :param node: the supplementary-material XML node
    :param kwargs: unused
    :return: None
    """
    caption = ""
    for child in node:
        if child.tag == "caption":
            _, caption = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption or "",
    }

    known_names = {
        os.path.basename(item["location"]) for item in self.supplementary_materials
    }
    if os.path.basename(location) not in known_names:
        self.supplementary_materials.append(material)

1517 

def parse_title(self, node, **kwargs):
    """
    Set self.title_tex and self.title_html from a title node.
    The node is parsed with ignore_xref=True.

    In xmldata.py, title_xml had the <title_group> tag:
    self.title_xml can't be set in parse_title

    :param node: the title XML node
    :param kwargs: unused
    :return: None
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = title_tex
    self.title_html = title_html

1524 

def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group> node:
    - the title nodes are handled by parse_title
    - the text of a <subtitle> is added to the title, after a space
    - <trans-title-group> is handled by parse_trans_title_group
    - the HTML of <abbrev-title> is stored in self.abbrev
    - the <fn> of a <fn-group> are added to self.footnotes_xml and self.footnotes_html
    Any other child creates a warning.
    self.title_xml is the XML of the title-group, without its <fn-group> children.

    :param node: the title-group XML node
    :param kwargs: unused
    :return: None
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    has_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            subtitle_tex, subtitle_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + subtitle_tex
            self.title_html += " " + subtitle_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            has_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(
                    fn_node, keep_fn=True, keep_fn_label=False
                )
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    xml_source = node
    if has_fn_group:
        # fn-group is now a funding statement and will be exported separately in the XML:
        # => remove it from the title-group (work on copies, the tree is left untouched)
        xml_source = etree.Element("title-group")
        for child in node:
            if normalize(child.tag) != "fn-group":
                xml_source.append(copy.deepcopy(child))

    self.title_xml = get_xml_from_node(xml_source)

1573 

def parse_trans_abstract(self, node, **kwargs):
    """
    Append the abstract found in a <trans-abstract> node to self.abstracts.
    The tag of the abstract is its "abstract-type" attribute ("abstract" if the
    attribute is missing or is "author"). The language is "und" if the node has none.

    :param node: the trans-abstract XML node
    :param kwargs: unused
    :return: None
    """
    abstract_type = get_normalized_attrib(node, "abstract-type")
    if not abstract_type or abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(abstract)

1590 

def parse_trans_title(self, node, **kwargs):
    """
    Set self.trans_title_tex, self.trans_title_html and self.trans_title_xml
    from a <trans-title> node.

    :param node: the trans-title XML node
    :param kwargs: unused
    :return: None
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = title_tex
    self.trans_title_html = title_html
    self.trans_title_xml = get_xml_from_node(node)

1594 

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse the <trans-title> children of a <trans-title-group> node with
    parse_trans_title, then set self.trans_lang to the language of the group
    ("und" if the group has none). Any other child creates a warning.

    :param node: the trans-title-group XML node
    :param kwargs: unused
    :return: None
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "trans-title":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        self.parse_trans_title(child)

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1613 

def get_data_from_contrib(self, node):
    """
    Build the contributor dict described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or <name-alternatives>.
    In a <mixed-citation>, each <name> creates 1 person: the same code can't be used there.

    :param node: the <contrib> XML node
    :return: the dict returned by create_contributor(), filled with the values found in node
    """
    contributor = create_contributor()

    for element in node:
        kind = element.tag

        if kind in ("name", "string-name"):
            self.update_data_from_name(element, contributor)
            # A <string-name> without structured parts is kept as a plain display name
            if (
                kind == "string-name"
                and contributor["first_name"] == ""
                and contributor["last_name"] == ""
            ):
                contributor["string_name"] = element.text or ""
        elif kind == "name-alternatives":
            contributor["mid"] = self.get_data_from_name_alternatives(element)
        elif kind == "contrib-id":
            id_type = element.get("contrib-id-type") or ""
            # Only these two identifier types are stored; the dict key is the type itself
            if id_type in ("orcid", "idref"):
                contributor[id_type] = element.text or ""
        elif kind == "address":
            contributor["addresses"].append(get_text_from_node(element))
        elif kind == "email":
            contributor["email"] = element.text or ""
        elif kind == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (element.get("ref-type") or "") == "aff":
                # Prefer the rid attribute, fall back on the text of the node
                target = element.get("rid") or get_text_from_node(element)
                if target != "":
                    contributor.setdefault("xrefs", []).append(target)
        elif kind == "collab":
            contributor["string_name"] = element.text or ""
        elif kind == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to the "role" key as we want a controlled
            # vocabulary (author / editor / organizer...): the value is ignored.
            pass
        else:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + kind})

    # The addresses are deliberately NOT sorted: sorting caused differences between the HTML
    # and the PDF (discovered in PCJ). The sort had been introduced on 22/09/2020, based on
    # differences between the Cedrics->JATS XSLT and the Cedrics import.

    helper_update_name_params(contributor)

    if (node.get("corresp") or "") == "yes":
        contributor["corresponding"] = True

    contributor["deceased_before_publication"] = (node.get("deceased") or "no") == "yes"
    contributor["equal_contrib"] = (node.get("equal-contrib") or "no") == "yes"

    return contributor

1693 

def get_data_from_custom_meta(self, node):
    """
    Read the name/value pair stored in a <custom-meta> node.

    :param node: the <custom-meta> XML node
    :return: (name, value): the texts of <meta-name> and <meta-value> ("" when the child is
        absent). Any other child is reported in self.warnings.
    """
    found = {"meta-name": "", "meta-value": ""}

    for element in node:
        element_tag = normalize(element.tag)

        if element_tag in found:
            found[element_tag] = element.text
        else:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + element_tag})

    return found["meta-name"], found["meta-value"]

1717 

def get_data_from_date(self, node, ignore_month=False):
    """
    Return the date stored in a date node as a string.

    The "iso-8601-date" attribute wins when present. Otherwise the string is assembled from
    the <year>, <month> and <day> children as "year", "year-month" or "year-month-day".
    Children with any other tag are reported in self.warnings.

    :param node: the date XML node
    :param ignore_month: if True, the <month> child is skipped (and, as a day without a month
        is meaningless, so is the day): only the year is returned
    :return: the date string ("" if the node carries no date information)
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    year = month = day = ""
    for child in node:
        tag = normalize(child.tag)

        if tag == "year":
            year = child.text
        elif tag == "month":
            # <month> is a known tag: when it is deliberately ignored, it must not end up
            # in the warnings as if it were an unexpected element
            if not ignore_month:
                month = child.text
        elif tag == "day":
            day = child.text
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    date_str = year
    if date_str and month:
        date_str += "-" + month
        # The day is only appended after a month: "year-day" is not a valid date
        if day:
            date_str += "-" + day

    return date_str

1751 

def get_data_from_ext_link(self, node, **kwargs):
    """
    Describe an <ext-link> node as a link dict, without side effects on the instance variables.

    :param node: the <ext-link> XML node
    :param kwargs: forwarded to parse_inner_node ("add_HTML_link" is forced to False)
    :return: dict with the keys "rel" (ext-link-type attribute), "mimetype" (always ""),
        "location" (href attribute), "base" (base attribute) and "metadata"
        (second value returned by parse_inner_node)
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    kwargs["add_HTML_link"] = False
    _first, metadata = self.parse_inner_node(node, **kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1769 

def get_data_from_history(self, node):
    """
    Collect the dates listed in a <history> node.

    :param node: the <history> XML node
    :return: list of {"type": <date-type attribute>, "date": <string from get_data_from_date>},
        one per child carrying a "date-type" attribute. Other children are reported in
        self.warnings.
    """
    # TODO: transform the returned list in a hash where date-type is the key
    #       => Change database_cmds
    dates = []

    for entry in node:
        if "date-type" not in entry.attrib:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + entry.tag})
            continue

        kind = entry.attrib["date-type"]
        dates.append({"type": kind, "date": self.get_data_from_date(entry)})

    return dates

1791 

def update_data_from_name(self, node, contributor):
    """
    Copy the parts of a name node into a contributor dict (modified in place).

    Children without text are skipped. <given-names>, <surname>, <prefix> and <suffix> fill
    the "first_name", "last_name", "prefix" and "suffix" keys; any other child with text is
    reported in self.warnings.

    :param node: the XML node holding the name parts
    :param contributor: the dict to update
    """
    key_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }

    for part in node:
        if part.text is None:
            continue

        key = key_by_tag.get(part.tag)
        if key is not None:
            contributor[key] = part.text
        else:
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + part.tag})

1813 

def get_data_from_name_alternatives(self, node):
    """
    Extract the index form of a name from a <name-alternatives> node.

    :param node: the <name-alternatives> XML node
    :return: the text of the (last) <string-name specific-use="index"> child, "" if there is
        none. Children without text are skipped; children with text and another tag are
        reported in self.warnings.
    """
    index_name = ""

    for alternative in node:
        if alternative.text is None:
            continue

        if alternative.tag != "string-name":
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + alternative.tag})
            continue

        if alternative.get("specific-use") == "index":
            index_name = alternative.text

    return index_name

1834 

def get_data_from_uri(self, node, **kwargs):
    """
    Describe a <uri> node as a link dict, without side effects on the instance variables.

    :param node: the <uri> XML node
    :param kwargs: forwarded to parse_inner_node ("add_HTML_link" is forced to False)
    :return: dict with the same keys as get_data_from_ext_link; "rel" is None, "mimetype" and
        "base" are "", "location" is the href attribute and "metadata" is the second value
        returned by parse_inner_node
    """
    location = get_normalized_attrib(node, "href") or ""

    kwargs["add_HTML_link"] = False
    _first, metadata = self.parse_inner_node(node, **kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1844 

def helper_add_link_from_node(self, node, **kwargs):
    """
    Return the text to display for a link node.

    The node is first described by the get_data_from_<tag> function matching its tag.
    If the link has no "rel" (or "rel" is "uri"), the returned text is a link to its location:
    a TeX \\href when self.for_tex_file is set, the result of make_links_clickable otherwise.
    For any other "rel", the text of the node is returned unchanged.

    :param node: the link XML node
    :param kwargs: forwarded to the get_data_from_<tag> function
    :return: the text (str)
    """
    getter_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    link = getattr(self, getter_name)(node, **kwargs)

    rel = link["rel"]
    if rel and rel != "uri":
        return node.text or ""

    location = link["location"]
    if self.for_tex_file:
        return "\\href{" + location + "}{" + link["metadata"] + "}"
    return make_links_clickable(location, link["metadata"])

1858 

def get_list_start_value(self, list_node):
    """
    Compute the offset at which the numbering of a list starts.

    A list without a "continued-from" attribute starts at 0. Otherwise the offset is the
    number of children of the referenced node plus the offset of that node (computed
    recursively). A "continued-from" value that matches no id in self.tree is treated as
    "no continuation".

    :param list_node: the list XML node
    :return: the offset (int)
    """
    continued_from = list_node.get("continued-from")
    if continued_from is None:
        return 0

    from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
    if from_node is None:
        # Dangling reference: without this guard no value was ever assigned and the
        # function raised UnboundLocalError
        return 0

    return len(from_node) + self.get_list_start_value(from_node)

1869 

1870 

class MathdocPublication(MathdocPublicationData, JatsBase):
    """
    Publication metadata filled in by walking the children of an XML tree.

    The tree is parsed in the constructor. Unexpected tags are reported in self.warnings.
    """

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: must contain "tree", the XML node to parse
        """
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Read the direct children of the tree and set the matching instance variables.

        :param tree: the XML node of the publication
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # An id without type, or of a Numdam/Mathdoc type, is the pid
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "serial-type" sets self.coltype, "wall" sets
        self.wall (converted to int) and "provider" sets self.provider.

        :param node: the <custom-meta-group> XML node
        :param kwargs: unused
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """
        Store a <description> node as an abstract with the "description" tag.

        value_xml keeps the full XML of the node; value_html and value_tex hold the same
        text without the enclosing <description> element.

        :param node: the <description> XML node
        :param kwargs: unused
        """
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the enclosing element. The opening tag may carry attributes, hence the
        # regular expression. (The previous code searched for the misspelt "<decription"
        # and therefore left the opening tag in place while removing the closing one.)
        inner = re.sub(r"^\s*<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1953 

1954 

class JatsPublisher(PublisherData):
    """Publisher name and location read from the children of a publisher XML node."""

    # Tag of a child -> name of the instance variable receiving its text
    _ATTRIBUTE_BY_TAG = {"publisher-name": "name", "publisher-loc": "loc"}

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: must contain "tree", the XML node to parse
        """
        super().__init__(*args, **kwargs)
        self.warnings = []
        self.parse_tree(kwargs["tree"])
        # NOTE(review): the warnings collected by parse_tree are discarded here.
        # Looks deliberate (publisher warnings are not reported) - confirm.
        self.warnings = []

    def parse_tree(self, tree):
        """
        Set self.name / self.loc from the <publisher-name> / <publisher-loc> children.
        Any other child is reported in self.warnings.

        :param tree: the publisher XML node
        """
        for element in tree:
            element_tag = normalize(element.tag)
            attribute = self._ATTRIBUTE_BY_TAG.get(element_tag)

            if attribute is not None:
                setattr(self, attribute, element.text)
            else:
                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + element_tag})

1980 

1981 

class JatsJournal(JournalData, JatsBase):
    """Journal metadata read from the children of an XML node (parsed in the constructor)."""

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: must contain "tree", the XML node to parse
        """
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Read the direct children of the tree: ids, titles, publisher and ISSNs.
        Any other child is reported in self.warnings.

        :param tree: the XML node of the journal
        """
        super().parse_tree(tree)

        # pub-type of an <issn> -> (instance variable, key used in self.ids)
        issn_targets = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for element in tree:
            element_tag = normalize(element.tag)

            if element_tag == "journal-id":
                # A journal-id without type is considered a Numdam id
                id_type = element.get("journal-id-type") or "numdam-id"
                if id_type in ("numdam-id", "mathdoc-id"):
                    self.pid = element.text
            elif element_tag == "journal-title-group":
                self.parse_title_group(element)
            elif element_tag == "publisher":
                self.publisher = JatsPublisher(tree=element)
            elif element_tag == "issn":
                # An issn without pub-type is considered a print ISSN
                target = issn_targets.get(element.get("pub-type") or "ppub")
                if target is not None:
                    attribute, id_key = target
                    setattr(self, attribute, element.text)
                    self.ids.append((id_key, element.text))
            else:
                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + element_tag})

2019 

2020 

class JatsIssue(IssueData, JatsBase):
    """
    Issue parsed from an XML tree: journal metadata, issue metadata and articles.

    The tree is parsed in the constructor. Unexpected tags are reported in self.warnings,
    which also receives the warnings of every article of the issue.
    """

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: must contain "tree" (the XML node to parse); may contain
            "from_folder" (default None) and "no_bib" (default False), both forwarded
            to the articles
        """
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs["from_folder"] if "from_folder" in kwargs else None
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the issue, then post-process its articles.

        1. Read <journal-meta>, <issue-meta> and the <article> children of <body>.
        2. Remove the editors of an article when they are the same as the issue editors.
        3. If at least one article has a "pii" attribute (the issue is then considered an
           Elsevier issue): fix the location of the icons, and recompute the type (atype)
           and the subjects (subjs) of every article.

        :param tree: the XML node of the issue
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        # The warnings of the article are reported at the issue level
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        is_elsevier = False
        for xarticle in self.articles:
            # Only the presence of the attribute is tested, not its value
            if hasattr(xarticle, "pii"):
                is_elsevier = True

            # The editors are compared pairwise, in order, on last_name and first_name
            editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]
            is_equal = len(editors) == len(issue_editors)
            i = 0
            while is_equal and i < len(editors):
                if (
                    editors[i]["last_name"] != issue_editors[i]["last_name"]
                    or editors[i]["first_name"] != issue_editors[i]["first_name"]
                ):
                    is_equal = False
                i += 1
            if is_equal:
                xarticle.contributors = [
                    contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
                ]

        if is_elsevier:
            # Fix location of icons
            for link in self.ext_links:
                if link["rel"] in ["icon", "small_icon"]:
                    base_dir = self.journal.pid
                    location = link["location"]
                    # Prefix with <journal pid>/<issue pid> unless the icon is already
                    # directly inside the journal folder
                    if os.path.dirname(location) != base_dir:
                        location = os.path.join(base_dir, self.pid, location)
                    if self.from_folder:
                        location = os.path.join(self.from_folder, location)
                        location = "file:" + location
                    link["location"] = location

            # Fix article types and subjects
            for xarticle in self.articles:
                article_type = "research-article"
                old_type = ""
                new_subjs = []

                if xarticle.fpage != "":
                    try:
                        # Only the success of the conversion matters: value is not read
                        # before being reassigned below
                        value = int(xarticle.fpage)
                    except ValueError:
                        # fpage is not a number: the article is an editorial
                        article_type = "editorial"

                if article_type == "research-article":
                    for subj in xarticle.subjs:
                        if subj["type"] == "type":
                            # Fix article types
                            # (subjects of type "type" are never copied into new_subjs)
                            value = subj["value"].lower()
                            old_type = value
                            if value == "discussion":
                                article_type = "letter"
                            elif value == "editorial":
                                if xarticle.title_tex.lower().find("foreword") == 0:
                                    article_type = "foreword"
                                else:
                                    article_type = "editorial"
                            elif value in ["mini review", "review article", "book review"]:
                                article_type = "review"
                            elif value == "research article":
                                article_type = "research-article"
                            elif value == "short communication":
                                article_type = "foreword"
                            elif value == "correspondence":
                                article_type = "letter"
                            elif value.find("conference") == 0:
                                article_type = "congress"
                        elif subj["type"] == "heading" and not xarticle.title_tex:
                            # The title may be stored in the heading: fix it
                            xarticle.title_tex = xarticle.title_html = subj["value"]
                            xarticle.title_xml = get_title_xml(subj["value"])
                        elif subj["type"] == "heading":
                            value = subj["value"].lower().strip()
                            issue_title = self.title_tex.lower()
                            # Remove the "dossier: " prefix (9 characters) from the issue title.
                            # The XML title is rebuilt from the lowercased title.
                            if issue_title.find("dossier: ") == 0:
                                issue_title = issue_title[9:]
                                self.title_tex = self.title_html = self.title_tex[9:]
                                self.title_xml = (
                                    "<issue-title>"
                                    + get_single_title_xml(issue_title)
                                    + "</issue-title>"
                                )

                            # Some heading values are in fact article type
                            # (find(...) == 0 means "the heading starts with ...";
                            # the first matching branch wins, so the order matters)
                            if value.find("erratum") == 0:
                                article_type = "erratum"
                            elif value.find("corrigendum") == 0:
                                article_type = "corrigendum"
                            elif value.find("foreword") == 0:
                                article_type = "foreword"
                            elif value.find("nécrologie") == 0 or value.find("obituary") == 0:
                                article_type = "history-of-sciences"
                            elif (
                                value.find("block calendar/éphéméride") == 0
                                or value.find("chronique") == 0
                            ):
                                article_type = "history-of-sciences"
                            elif value.find("histoire") == 0 or value.find("historic") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("tribute/hommage") == 0:
                                article_type = "history-of-sciences"
                            elif value.find("note historique") == 0:
                                article_type = "historical-commentary"
                            elif (
                                value.find("le point sur") == 0 or value.find("le point-sur") == 0
                            ):
                                article_type = "review"
                            elif (
                                value.find("review") == 0
                                or value.find("revue") == 0
                                or value.find("concise review") == 0
                            ):
                                article_type = "review"
                            elif value.find("conférence") == 0:
                                article_type = "congress"
                            elif (
                                value.find("communication") == 0 or value.find("preliminary") == 0
                            ):
                                article_type = "preliminary-communication"
                            elif value.find("perspective") == 0 and old_type in [
                                "correspondence",
                                "short communication",
                            ]:
                                article_type = "opinion"
                            elif value.find("debate") == 0:
                                article_type = "opinion"
                            elif (
                                value.find("index") == 0
                                or value.find("keyword") == 0
                                or value.find("sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif (
                                value.find("table auteurs") == 0
                                or value.find("table sommaire") == 0
                            ):
                                article_type = "editorial"
                            elif value.find("page présentation des index") == 0:
                                article_type = "editorial"
                            elif value.find("fac-similé") == 0:
                                # crbiol article: PubMed classifies these as "Classical Article"
                                article_type = "historical-commentary"
                                # The subject is kept in this case to preserve the "fac-similé" (== copy) mention
                                new_subjs.append(subj)
                            # Ignore the issue titles
                            elif (
                                not self.title_tex
                                or value.find(self.title_tex.lower().strip()) != 0
                            ):
                                # Exclude headings that are redundant with article types
                                exclude_list = [
                                    "editorial",
                                    "éditorial",
                                    "avant-propos",
                                    "book review",
                                    "comment",
                                    "concise review paper",
                                    "answer",
                                    "commentaire",
                                    "commentary",
                                    "reply",
                                    "foreword",
                                    "full paper",
                                    "mémoire",
                                ]
                                # The heading is kept only if it starts with none of the excluded values
                                if len([x for x in exclude_list if value.find(x) == 0]) == 0:
                                    new_subjs.append(subj)
                        else:
                            # Subjects that are neither "type" nor "heading" are kept as is
                            new_subjs.append(subj)

                # print(old_type, '-', old_heading, '-', article_type, '-', xarticle.pid, '-', xarticle.fpage)
                xarticle.atype = article_type
                xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "provider" sets self.provider and "efirst" sets
        self.with_online_first (True if the value is "yes").

        :param node: the <custom-meta-group> XML node
        :param kwargs: unused
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """
        Read the children of <issue-meta>: ids, volume, number, year, history dates and titles.

        Tags without an explicit branch are dispatched to the parse_<tag> function of the
        instance if it exists (called with add_ext_link=True), reported in self.warnings
        otherwise. If no "last-modified" date was found, the current time is used.

        :param node: the <issue-meta> XML node
        :param kwargs: unused
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    # The first title whose language is compatible with the issue language
                    # is the main title; any other title is stored as the translated title
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2338 

2339 

class JatsArticleBase(JatsBase):
    """Base class providing the parse_<tag> functions common to the article-like objects."""

    def parse_custom_meta_group(self, node, **kwargs):
        """
        Read the <custom-meta> children: "article-number" and "talk-number" set the instance
        variables of the same name (with "_"); "presented" adds a contributor with the
        "presenter" role to self.contributors. Children with another tag are reported in
        self.warnings.

        :param node: the <custom-meta-group> XML node
        :param kwargs: unused
        """
        for element in node:
            element_tag = normalize(element.tag)

            if element_tag != "custom-meta":
                location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: location + " " + element_tag})
                continue

            name, value = self.get_data_from_custom_meta(element)

            if name == "article-number":
                self.article_number = value
            elif name == "talk-number":
                self.talk_number = value
            elif name == "presented":
                # The value is a sentence: keep only the name of the presenter
                presenter_name = value.replace("Presented by ", "")
                presenter_name = presenter_name.replace("Présenté par ", "")

                presenter = create_contributor()
                presenter["role"] = "presenter"
                presenter["string_name"] = presenter_name
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)

2370 

2371 

2372class JatsArticle(ArticleData, JatsArticleBase): 

def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """
    Read the options from kwargs, then parse the tree.

    :param kwargs: must contain "tree" (the XML node to parse). Optional keys:
        "pid", "issue", "from_folder" (default None),
        "add_span_around_tex_formula", "for_tex_file", "no_bib" (default False)
    """
    super().__init__(*args, **kwargs)

    self.pid = kwargs.get("pid")
    self.issue = kwargs.get("issue")

    self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
    self.for_tex_file = kwargs.get("for_tex_file", False)
    self.from_folder = kwargs.get("from_folder")
    self.no_bib = kwargs.get("no_bib", False)

    self.parse_tree(kwargs["tree"])

2388 

def parse_tree(self, tree):
    """
    Parse an article tree in two passes, then finalize the body and the links.

    The first pass reads <front> (or <front-stub>) and <floats-group>, so that the metadata
    and the floats are known before the body is parsed. The second pass reads <back>,
    <body> and <sub-article>. Unexpected tags are reported in self.warnings.

    Afterwards the footnotes are appended to body_html, the funding statement is wrapped in
    a <name-content content-type="fn"> element if it is not already, the XML of the floats
    group (if any) is appended to body_xml and, when no_bib is set, a "source" link to
    Numdam is added to ext_links.

    :param tree: the XML node of the article
    """
    super().parse_tree(tree)

    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                tag = normalize(child.tag)

                if tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    for node in tree:
        tag = normalize(node.tag)
        if tag == "back":
            for child in node:
                tag = normalize(child.tag)

                if tag == "ref-list" and not self.no_bib:
                    self.parse_ref_list(child)
                elif tag == "ack":
                    self.parse_ack(child)
                elif tag == "sec":
                    self.parse_sec(child)
                elif tag == "app-group":
                    self.parse_app_group(child)
                elif tag == "fn-group":
                    self.parse_fn_group(child)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in ("floats-group", "front", "front-stub"):
            # Handled above (front-stub included: it must not be reported as unknown)
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # Add the footnotes at the end
    if self.fns:
        fn_text = '<div class="footnotes">' + "".join(self.fns) + "</div>"
        self.body_html = fn_text if not self.body_html else self.body_html + fn_text

    if (
        self.funding_statement_xml
        and '<name-content content-type="fn"' not in self.funding_statement_xml
    ):
        self.funding_statement_xml = (
            f'<name-content content-type="fn">{self.funding_statement_xml}</name-content>'
        )

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # The web scraping that used to find the publication date of the Elsevier articles was
    # moved to the import management commands (Elsevier blocks IP after 1000+ requests).

    if self.no_bib:
        # For Geodesic
        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = "http://www.numdam.org/item/" + self.pid
        ext_link["metadata"] = "NUMDAM"
        self.ext_links.append(ext_link)

2503 

def update_body_content(self, node, **kwargs):
    """
    Parse a body-like node and merge the result into body_tex, body_html and body_xml.

    A node without children is ignored.
    kwargs are forwarded to the node parser, together with a computed "base_url".
    A truthy "use_sec" keyword selects parse_node_with_sec instead of
    parse_node_with_mixed_content.
    """
    if not len(node):
        # Most journals do not display the full text: the <body> is then only used
        # to store the text for the search engine and has no children.
        # body_html is not computed in this case. The same applies to old articles
        # without full text in journals that otherwise display it.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        kwargs["base_url"] = (
            "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
        )
    else:
        kwargs["base_url"] = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    use_sec = bool(kwargs.get("use_sec"))
    nb_supplements_before = len(self.supplementary_materials)

    # Hack for Elsevier: with use_sec, <ack> is converted into a <sec> of the <body>
    parser = self.parse_node_with_sec if use_sec else self.parse_node_with_mixed_content
    node_tex, node_html = parser(node, **kwargs)

    if len(self.supplementary_materials) != nb_supplements_before:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but the node is left out of the body
        # if its <p> children hold nothing but supplements.
        has_other_content = any(
            gchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for gchild in child
        )
        if not has_other_content:
            return

    self.body_tex = self.body_tex + node_tex if self.body_tex else node_tex
    self.body_html = self.body_html + node_html if self.body_html else node_html

    node_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = node_xml
    elif use_sec:
        # Drop the last 7 characters of the stored XML (presumably "</body>") and
        # the first 5 / last 6 characters of the new XML (presumably "<ack>" and
        # "</ack>"), then re-wrap the new content in a <sec>.
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{node_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{node_xml}</body>"

2554 

def parse_ack(self, node, **kwargs):
    """
    Parse an <ack> node.

    A node with content-type "COI-statement" is stored as text in self.coi_statement.
    Any other <ack> is merged into the body as a <sec>.
    """
    if (node.get("content-type") or "") != "COI-statement":
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        self.update_body_content(node, use_sec=True)
        return

    self.coi_statement = get_text_from_node(node)

2562 

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node: each <sec> child is merged into the body.

    Any other child tag is recorded in self.warnings.
    """
    for section in node:
        tag = normalize(section.tag)

        if tag != "sec":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(section)

2581 

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node: each <app> child is handed to parse_app.

    Any other child tag is recorded in self.warnings.
    """
    for app_node in node:
        tag = normalize(app_node.tag)

        if tag != "app":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_app(app_node)

2598 

def parse_article_categories(self, node, **kwargs):
    """
    Parse <article-categories>: each <subj-group> child is handed to parse_subj_group.

    Any other child tag is recorded in self.warnings.
    """
    for group in node:
        tag = normalize(group.tag)

        if tag != "subj-group":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        self.parse_subj_group(group)

2615 

def parse_article_meta(self, node, **kwargs):
    """
    Parse the children of <article-meta> and set the matching instance variables.

    Ids, pages, size, elocation and dates are handled here.
    Any other tag is dispatched to a parse_<tag> method (called with add_ext_link=True)
    when one exists; otherwise the tag is recorded in self.warnings.
    """
    # Tags that are read but deliberately not stored.
    # TODO: store permissions in XML
    # author-notes: ignored since the 2022/11/15 Mersenne meeting
    # (self.parse_author_notes is no longer called from here)
    skipped_tags = (
        "volume",
        "issue-id",
        "permissions",
        "pub-date-not-available",
        "author-notes",
    )

    for child in node:
        tag = normalize(child.tag)

        if tag in skipped_tags:
            continue

        if tag == "article-id":
            self.parse_id(child)
        elif tag == "fpage":
            self.fpage = child.text
            self.page_type = child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = child.text or ""
        elif tag == "page-range":
            self.page_range = child.text
        elif tag == "page-count" or tag == "size":
            self.size = child.text
        elif tag == "elocation-id":
            self.elocation = child.text
        elif tag == "pub-date":
            # A <pub-date> without date-type is treated as the publication date
            is_publication_date = (child.get("date-type") or "pub") == "pub"
            iso_date = self.get_data_from_date(child)
            if is_publication_date:
                self.date_published_iso_8601_date_str = iso_date
            else:
                self.history_dates.append({"type": "online", "date": iso_date})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(child)
            for entry in self.history_dates:
                if entry["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = entry["date"]
        else:
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child, add_ext_link=True)
            else:
                origin = (
                    self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                )
                self.warnings.append({self.pid: origin + " " + tag})

2667 

def parse_author_notes(self, node, **kwargs):
    """
    Parse <author-notes>: the XML and HTML of each <fn> child are appended
    to self.footnotes_xml and self.footnotes_html. Other children are ignored.
    """
    for note in node:
        if normalize(note.tag) != "fn":
            continue

        _, note_html = self.parse_node_with_fn(note, keep_fn=True, keep_fn_label=False)
        self.footnotes_xml += get_xml_from_node(note)
        self.footnotes_html += note_html

2676 

def parse_body(self, node, **kwargs):
    """
    Parse <body>: store its plain text in self.body, then compute the
    TeX/HTML/XML versions through update_body_content.

    self.body_xml falls back to the raw XML of the node if it is still empty afterwards.
    """
    self.body = get_text_from_node(node)

    # self.floats only exists once parse_floats_group has run
    if hasattr(self, "floats"):
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    if self.body_xml:
        return
    self.body_xml = get_xml_from_node(node)

2687 

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> found inside <floats-group>.

    The HTML of the node is stored in the self.floats dictionary, keyed by the "id"
    attribute of the node. A node without id is parsed but not stored.
    """
    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    box_id = node.attrib.get("id")
    if box_id is None:
        return

    self.floats[box_id] = box_html

2699 

def parse_floats_group(self, node, **kwargs):
    """
    Parse <floats-group>: figures, tables and boxed texts.

    self.floats is reset to an empty dictionary before the children are parsed.
    Any unknown child tag is recorded in self.warnings.
    The XML of the whole node is kept in self.floats_group_xml.
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}

    for float_node in node:
        tag = normalize(float_node.tag)

        if tag == "boxed-text":
            self.parse_boxed_text(float_node, base_url=base_url)
        elif tag == "table-wrap":
            self.parse_node_with_table_wrap(float_node, append_floats=True, base_url=base_url)
        elif tag == "fig":
            self.parse_node_with_fig(float_node, append_floats=True, base_url=base_url)
        else:
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2730 

def parse_fn_group(self, node, **kwargs):
    """
    Parse <fn-group>: the HTML and XML of each <fn> child are appended
    to self.footnotes_html and self.footnotes_xml.

    Any other child tag is recorded in self.warnings.
    """
    for note in node:
        tag = normalize(note.tag)

        if tag != "fn":
            origin = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: origin + " " + tag})
            continue

        _, note_html = self.parse_node_with_fn(note, keep_fn=True)
        note_xml = get_xml_from_node(note)

        self.footnotes_html += note_html
        self.footnotes_xml += note_xml

2751 

def parse_funding_group(self, node, **kwargs):
    """
    Parse <funding-group>: award groups and the funding statement.

    <award-group> children are handed to parse_award_group.
    Inside <funding-statement>, only <name-content> children are handled:
    the HTML of their <fn> children is appended to self.funding_statement_html
    and the XML of the <name-content> node is stored in self.funding_statement_xml.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "award-group":
            self.parse_award_group(child)
        elif tag == "funding-statement":
            for funding_node in child:
                # Raw tags are compared here (no normalize), unlike the outer loop
                if funding_node.tag == "name-content":
                    for funding_child in funding_node:
                        if funding_child.tag == "fn":
                            _, html = self.parse_node_with_fn(funding_child, keep_fn=True)
                            self.funding_statement_html += html
                    # NOTE(review): the nesting level of this assignment was rebuilt from a
                    # listing without indentation; it is assumed to run once per
                    # <name-content> node, after its <fn> children. Confirm against the repo.
                    # The XML is overwritten (not appended) for each <name-content>.
                    self.funding_statement_xml = get_xml_from_node(funding_node)

            # TODO: handle funding-statement with simple texts
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

2778 

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the "seq" attribute of the <issue> node ("0" if missing or empty).

    The attribute is ignored, and "0" is used, for objects that have a pii attribute.
    """
    if hasattr(self, "pii"):
        # Elsevier stores unusable values in the seq attribute
        # (objects with a pii are presumably Elsevier articles)
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2782 

2783 

class JatsRef(RefBase, JatsBase):
    """
    Bibliographic reference parsed from a JATS <ref> node.

    The <ref> element is expected in the "tree" keyword argument of the constructor
    and is parsed immediately.
    """

    def __init__(self, *args, **kwargs):  # , tree, lang):
        super().__init__(*args, **kwargs)  # lang)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of <ref>: label, mixed-citation/note, element-citation.

        The XML of every child (known or not) is appended to self.citation_xml.
        Unknown tags are also recorded in self.warnings.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                # Labels are stored between square brackets
                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Build and return a contributor dict (see create_contributor) from a
        name-like node (<name>, <string-name>, <name-alternatives>, <collab>, <etal>).

        No side effect on the instance: the caller decides where to store the result.
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # Fall back to the raw text when the name was not split in first/last name
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title>.
        Inside a mixed-citation, the HTML goes through add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source>.
        Inside a mixed-citation, the HTML goes through add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Extract the structured data of a citation node (type, titles, source, series,
        contributors, ids, comment, annotation...) into instance variables.

        Once a <comment> has been met, data that would overwrite an already filled
        field is appended to self.comment instead.
        Tags without a dedicated branch are stored in the instance variable named
        after the tag ("-" replaced by "_"), unless that variable is already set.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                else:
                    if self.source_tex and has_comment:
                        self.comment += " " + source_tex
                    else:
                        self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    else:
                        if self.series and has_comment:
                            self.comment += ", " + article_title_tex
                        else:
                            self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                else:
                    if self.article_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                # A name outside of a <person-group> is considered an author
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag == "isbn":
                if self.annotation:
                    self.annotation += ", "
                # child.text is None for an empty element: avoid a TypeError
                self.annotation += "ISBN: " + (child.text or "")
            elif tag == "issn":
                if self.annotation:
                    self.annotation += ", "
                # child.text is None for an empty element: avoid a TypeError
                self.annotation += "ISSN: " + (child.text or "")
            elif child.text is not None:
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    # The field is already filled: keep the extra value in the comment
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group>: each name-like child becomes a contributor whose role
        is the person-group-type attribute, without its trailing "s" if any.

        Any other child tag is recorded in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """
        Parse a <pub-id>: its pub-id-type attribute and text are passed to
        add_extids_from_node_with_link as "rel" and "metadata".
        """
        node_type = node.get("pub-id-type") or ""

        data = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Split the lower-cased label, without its first and last characters,
        around its run of digits into self.label_prefix and self.label_suffix.
        """
        label = self.label.lower()
        if len(label) > 1:
            # Remove the first and last characters (the enclosing brackets added by parse_tree)
            label = label[1:-1]

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # The split did not give exactly 2 parts.
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

3051 

3052 

class BitsCollection(CollectionData, JatsBase):
    """
    Collection parsed from a BITS <collection-meta> or <in-collection> node.

    The node is expected in the "tree" keyword argument of the constructor
    and is parsed immediately.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or an <in-collection> node and its children.

        self.seq is computed once a <collection-meta> has been found; a warning is
        recorded otherwise. self.collection is always set to an object that only
        carries the pid of the collection.
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None
            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                # No <collection-meta>: the warning carries the last tag read
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse <collection-meta>: collection type, pid, titles, ISSNs, ext-links
        and the volume data stored in <volume-in-collection>.

        Any unknown child tag is recorded in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                # Only the "ppub" and "epub" pub-type values are stored
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume(self, node, **kwargs):
        """
        Store the text of the node in self.volume (None for an empty element).
        """
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse <volume-in-collection>: volume number, series and title.

        Any unknown child tag is recorded in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume_series(self, node, **kwargs):
        """
        Store the text of the node in self.vseries (None for an empty element).
        """
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """
        Store the TeX, HTML and XML versions of the volume title.
        """
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Compute self.seq, the integer used to order the volumes of a collection.

        The "seq" attribute of the node is used when it is an integer.
        Otherwise seq is derived from the first number of self.volume (0 if there is
        none), offset by self.vseries * 10000 when self.vseries is an integer.
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

            # Third, use self.vseries as an offset
            # NOTE(review): this step is assumed to apply only when there is no usable
            # seq attribute (nesting rebuilt from a listing without indentation) - confirm.
            try:
                # Assumes no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except (TypeError, ValueError):
                # vseries is not a number; TypeError covers vseries being None
                # (parse_volume_series stores node.text, None for an empty element)
                pass

3193 

3194 

3195class BitsBook(BookData, JatsBase): 

3196 def __init__(self, *args, **kwargs): 

3197 super().__init__(*args, **kwargs) 

3198 self.no_bib = kwargs.get("no_bib", False) 

3199 

3200 self.parse_tree(kwargs["tree"]) 

3201 

3202 def parse_tree(self, tree): 

3203 super().parse_tree(tree) 

3204 

3205 book_type = get_normalized_attrib(tree, "book-type") or "Book" 

3206 self.ctype = "book-" + book_type 

3207 

3208 for node in tree: 

3209 if type(tree) == type(node): 3209 ↛ 3208line 3209 didn't jump to line 3208, because the condition on line 3209 was never false

3210 tag = normalize(node.tag) 

3211 

3212 if tag in ("collection-meta", "in-collection"): 

3213 col = BitsCollection(tree=node) 

3214 self.incollection.append(col) 

3215 elif tag == "book-meta": 

3216 self.parse_book_meta(node) 

3217 elif tag == "book-body": 

3218 self.parse_book_body(node) 

3219 elif tag == "front-matter": 

3220 self.parse_front_matter(node) 

3221 elif tag == "book-back": 

3222 for child in node: 

3223 tag = normalize(child.tag) 

3224 if tag == "ref-list": 

3225 self.parse_ref_list(child) 

3226 else: 

3227 self.warnings.append( 

3228 { 

3229 self.pid: self.__class__.__name__ 

3230 + "." 

3231 + inspect.currentframe().f_code.co_name 

3232 + " " 

3233 + tag 

3234 } 

3235 ) 

3236 else: 

3237 self.warnings.append( 

3238 { 

3239 self.pid: self.__class__.__name__ 

3240 + "." 

3241 + inspect.currentframe().f_code.co_name 

3242 + " " 

3243 + tag 

3244 } 

3245 ) 

3246 

3247 self.set_contribs() 

3248 self.set_title() 

3249 

3250 def parse_book_body(self, node, **kwargs): 

3251 for child in node: 

3252 if type(child) == type(node): 3252 ↛ 3251line 3252 didn't jump to line 3251, because the condition on line 3252 was never false

3253 tag = normalize(child.tag) 

3254 

3255 if tag == "book-part": 3255 ↛ 3260line 3255 didn't jump to line 3260, because the condition on line 3255 was never false

3256 book_part = BitsBookPart(tree=child) 

3257 self.warnings.extend(book_part.warnings) 

3258 self.parts.append(book_part) 

3259 else: 

3260 self.warnings.append( 

3261 { 

3262 self.pid: self.__class__.__name__ 

3263 + "." 

3264 + inspect.currentframe().f_code.co_name 

3265 + " " 

3266 + tag 

3267 } 

3268 ) 

3269 

3270 if not self.parts: 

3271 self.body = get_text_from_node(node) 

3272 

3273 def parse_book_meta(self, node, **kwargs): 

3274 for child in node: 

3275 tag = normalize(child.tag) 

3276 

3277 if tag == "book-id": 

3278 self.parse_id(child) 

3279 elif tag == "pub-date": 

3280 self.year = self.get_data_from_date(child) 

3281 elif tag == "book-volume-number": 3281 ↛ 3282line 3281 didn't jump to line 3282, because the condition on line 3281 was never true

3282 self.volume = child.text 

3283 self.volume_int = child.text 

3284 elif tag == "pub-history": 

3285 history_dates = self.get_data_from_history(child) 

3286 for date in history_dates: 

3287 if date["type"] == "last-modified": 

3288 self.last_modified_iso_8601_date_str = date["date"] 

3289 elif date["type"] == "prod-deployed-date": 3289 ↛ 3290line 3289 didn't jump to line 3290, because the condition on line 3289 was never true

3290 self.prod_deployed_date_iso_8601_date_str = date["date"] 

3291 elif tag == "book-title-group": 

3292 self.parse_title_group(child) 

3293 elif tag == "publisher": 

3294 self.publisher = JatsPublisher(tree=child) 

3295 else: 

3296 fct_name = "parse_" + tag.replace("-", "_") 

3297 ftor = getattr(self, fct_name, None) 

3298 if callable(ftor): 

3299 ftor(child, add_ext_link=True) 

3300 else: 

3301 self.warnings.append( 

3302 { 

3303 self.pid: self.__class__.__name__ 

3304 + "." 

3305 + inspect.currentframe().f_code.co_name 

3306 + " " 

3307 + tag 

3308 } 

3309 ) 

3310 

3311 if self.last_modified_iso_8601_date_str is None: 3311 ↛ 3312line 3311 didn't jump to line 3312, because the condition on line 3311 was never true

3312 self.last_modified_iso_8601_date_str = timezone.now().isoformat() 

3313 

3314 def parse_custom_meta_group(self, node, **kwargs): 

3315 for child in node: 

3316 tag = normalize(child.tag) 

3317 

3318 if tag == "custom-meta": 3318 ↛ 3315line 3318 didn't jump to line 3315, because the condition on line 3318 was never false

3319 name, value = self.get_data_from_custom_meta(child) 

3320 

3321 if name == "provider": 3321 ↛ 3315line 3321 didn't jump to line 3315, because the condition on line 3321 was never false

3322 self.provider = value 

3323 

def set_contribs(self):
    """
    Fill in the contributors of the book when it declares no author of its own.

    Nothing is done if a contributor with the "author" role already exists,
    or if the book has no parts. Otherwise:
    - a "book-monograph" takes the contributors of its first part
    - a "book-edited-book" or "book-lecture-notes" whose parts all have the same
      contributors takes these contributors (an edited book is then re-typed
      as "book-monograph")
    - a "book-edited-book" or "book-lecture-notes" whose parts have different
      contributors gets a generic "Collectif" author appended

    :return: None, the instance is modified in place
    """
    roles = [contrib["role"] for contrib in self.contributors]
    if "author" in roles:
        return

    if self.ctype == "book-monograph":
        if self.parts:
            # The list object of the first part is shared, not copied
            self.contributors = self.parts[0].contributors
        return

    if self.ctype not in ("book-edited-book", "book-lecture-notes") or not self.parts:
        return

    # Compare every other part with the first one
    reference = self.parts[0].contributors
    if any(other.contributors != reference for other in self.parts[1:]):
        collective = create_contributor()
        collective["string_name"] = "Collectif"
        collective["role"] = "author"
        collective["contrib_xml"] = get_contrib_xml(collective)
        self.contributors.append(collective)
        return

    # All the parts are written by the same people
    if self.ctype == "book-edited-book":
        self.ctype = "book-monograph"
    self.contributors = reference

3361 

def set_title(self):
    """
    Use the title of the first `incollection` entry when the book has an empty title_xml.

    The xml, html and tex versions of the title are copied together.

    :return: None
    """
    if self.title_xml != "" or len(self.incollection) == 0:
        return

    source = self.incollection[0]
    self.title_xml, self.title_html, self.title_tex = (
        source.title_xml,
        source.title_html,
        source.title_tex,
    )

3367 

3368 

class BitsBookPart(BookPartData, JatsArticleBase):
    """
    Parser of a BITS <book-part> node.

    The tree given in the `tree` keyword argument is parsed by the constructor.
    Tags that are not handled are reported in self.warnings as
    {pid: "<ClassName>.<method name> <tag>"}.
    """

    def __init__(self, *args, **kwargs):
        """
        :param kwargs: must contain `tree` (the <book-part> node);
                       may contain `no_bib` (stored in self.no_bib, False by default)
        """
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the <book-part> node: its attributes, then each of its children.

        :param tree: the <book-part> XML node
        :return: None
        """
        super().parse_tree(tree)

        self.atype = get_normalized_attrib(tree, "book-part-type") or ""
        try:
            self.seq = int(get_normalized_attrib(tree, "seq") or "")
        except ValueError:
            # Missing or non-numeric "seq" attribute: self.seq is left untouched
            pass

        for section in tree:
            section_tag = normalize(section.tag)

            if section_tag == "book-part-meta":
                self.parse_book_part_meta(section)
            elif section_tag == "body":
                self.parse_body(section)
            elif section_tag == "front-matter":
                self.parse_front_matter(section)
            elif section_tag == "back":
                # Only the bibliography is read in <back>
                for back_child in section:
                    back_tag = normalize(back_child.tag)

                    if back_tag == "ref-list":
                        self.parse_ref_list(back_child)
                    else:
                        self.warnings.append(
                            {
                                self.pid: f"{self.__class__.__name__}."
                                f"{inspect.currentframe().f_code.co_name} {back_tag}"
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: f"{self.__class__.__name__}."
                        f"{inspect.currentframe().f_code.co_name} {section_tag}"
                    }
                )

        # Workaround a numdam-plus bug where a book-part can have a trans-title without a title
        # TODO: Fix numdam-plus, the books impacted and remove the hack
        self.set_title()

    def parse_book_part_meta(self, node, **kwargs):
        """
        Parse the children of <book-part-meta>.

        The id and the page tags are handled here. Any other tag is passed to the
        parse_<tag> method of the instance if there is one (with "-" replaced by "_"),
        otherwise a warning is recorded.

        :param node: the <book-part-meta> XML node
        :param kwargs: accepted for signature compatibility, not used here
        :return: None
        """
        for item in node:
            item_tag = normalize(item.tag)

            if item_tag == "book-part-id":
                self.parse_id(item)
            elif item_tag == "fpage":
                self.fpage = item.text
                self.page_type = get_normalized_attrib(item, "content-type") or ""
            elif item_tag == "lpage":
                self.lpage = item.text
            elif item_tag == "page-range":
                self.page_range = item.text
            else:
                handler = getattr(self, "parse_" + item_tag.replace("-", "_"), None)
                if callable(handler):
                    handler(item)
                else:
                    self.warnings.append(
                        {
                            self.pid: f"{self.__class__.__name__}."
                            f"{inspect.currentframe().f_code.co_name} {item_tag}"
                        }
                    )

    def parse_body(self, node, **kwargs):
        """
        Parse the <body> of the book part.

        Each nested <book-part> becomes a BitsBookPart appended to self.parts
        (its warnings are added to the warnings of this part). Any other child is
        reported as a warning. The text of the whole node is stored in self.body.

        :param node: the <body> XML node
        :param kwargs: accepted for signature compatibility, not used here
        :return: None
        """
        for item in node:
            item_tag = normalize(item.tag)

            if item_tag == "book-part":
                # NOTE(review): no_bib is not forwarded to the nested part - confirm it is intended
                nested_part = BitsBookPart(tree=item)
                self.warnings.extend(nested_part.warnings)
                self.parts.append(nested_part)
            else:
                self.warnings.append(
                    {
                        self.pid: f"{self.__class__.__name__}."
                        f"{inspect.currentframe().f_code.co_name} {item_tag}"
                    }
                )

        self.body = get_text_from_node(node)

    def set_title(self):
        """
        Bug in some books: some chapters may have a trans-title, but no title !
        Hack: when title_html is empty and trans_title_html is not,
        the html and tex titles are set from the trans-title.

        :return: None
        """
        if not self.trans_title_html or self.title_html:
            return

        self.title_html = self.trans_title_html
        self.title_tex = self.trans_title_tex

3484 

3485 

3486###################################################################################### 

3487# 

3488# Functions used by ptf-tools 

3489# 

3490###################################################################################### 

3491 

3492 

def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference from the citation XML of a bibitem, with updated identifiers.

    In the <element-citation> (or, failing that, the <mixed-citation>) of the reference:
    - the direct <ext-link> and <pub-id> children holding a zbl-item-id, mr-item-id, doi,
      numdam-id or mathdoc-id (and eid for <ext-link>) are removed
    - one child is appended for each entry of new_ids that is checked and is not
      a false positive: a <pub-id> for doi, arxiv, tel, hal and theses.fr,
      an <ext-link> for any other id type

    If the reference has neither citation node, its XML is parsed unchanged.

    :param bibitem: object with a `citation_xml` string (the content of the <ref>)
    :param new_ids: dict {id_type: {"checked": ..., "false_positive": ..., "id_value": ...}}
    :return: a JatsRef built from the updated tree
    """
    xml = "<ref>" + bibitem.citation_xml + "</ref>"
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    tree = etree.fromstring(xml, parser=xml_parser)

    citation = tree.find("element-citation")
    if citation is None:
        citation = tree.find("mixed-citation")

    if citation is not None:
        ext_link_types = ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id", "eid")
        pub_id_types = ("zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id")

        # Collect first, remove afterwards: the node is not modified while it is iterated
        stale_ids = [
            child
            for child in citation
            if (child.tag == "ext-link" and child.get("ext-link-type") in ext_link_types)
            or (child.tag == "pub-id" and child.get("pub-id-type") in pub_id_types)
        ]
        for child in stale_ids:
            citation.remove(child)

        for id_type, id_data in new_ids.items():
            if not id_data["checked"] or id_data["false_positive"]:
                continue

            if id_type in ("doi", "arxiv", "tel", "hal", "theses.fr"):
                element_name, type_attrib = "pub-id", "pub-id-type"
            else:
                element_name, type_attrib = "ext-link", "ext-link-type"

            id_node = etree.Element(element_name)
            id_node.set(type_attrib, id_type)
            id_node.text = id_data["id_value"]
            citation.append(id_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=tree, lang="und")

3546 

3547 

def check_bibitem_xml(bibitem):
    """
    Parse the citation XML of a bibitem, wrapped in a <ref>, and build a JatsRef from it.

    The XML parser is created with recover=True.

    :param bibitem: object with a `citation_xml` string (the content of the <ref>)
    :return: a JatsRef built with the "und" language
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring("<ref>" + bibitem.citation_xml + "</ref>", parser=xml_parser)

    return JatsRef(tree=ref_tree, lang="und")

3557 

3558 

3559# Create XML strings based on internal data 

3560 

3561 

def get_single_title_xml(title):
    """
    Escape a title while keeping its italic, superscript and subscript markup.

    <i>, <sup> and <sub> are kept only if both the opening and the closing tags are
    found in the title: they are hidden behind |||tag||| placeholders during the
    escape, then restored (<i> becomes the JATS <italic>). Everything else goes
    through escape().

    :param title: the title string, possibly with <i>, <sup>, <sub> tags
    :return: the escaped title
    """
    # (tag found in the title, tag written in the result)
    inline_tags = (("i", "italic"), ("sup", "sup"), ("sub", "sub"))

    # The presence of the tags is checked on the untouched title
    kept_tags = [
        (found_tag, written_tag)
        for found_tag, written_tag in inline_tags
        if title.find(f"<{found_tag}>") >= 0 and title.find(f"</{found_tag}>") >= 0
    ]

    for found_tag, _ in kept_tags:
        title = title.replace(f"<{found_tag}>", f"|||{found_tag}|||")
        title = title.replace(f"</{found_tag}>", f"|||/{found_tag}|||")

    title = escape(title)

    for found_tag, written_tag in kept_tags:
        title = title.replace(f"|||{found_tag}|||", f"<{written_tag}>")
        title = title.replace(f"|||/{found_tag}|||", f"</{written_tag}>")

    return title

3586 

3587 

def get_title_xml(title, trans_title=None, trans_lang=None, with_tex_values=True):
    """
    Build the <title-group> XML of a title and, optionally, of its translation.

    If the title has formulas, use CKeditorParser first, then call this function with
    the value_xml returned by the parser and set with_tex_values to False.
    TODO: enhance CkeditorParser to accept both title and trans_title to build the xml in 1 shot.

    :param title: the title
    :param trans_title: the translated title; used only if trans_lang is given as well
    :param trans_lang: the language of the translated title
    :param with_tex_values: if True, the titles go through get_single_title_xml;
                            if False, they are inserted as they are
    :return: the <title-group> XML string
    """
    if with_tex_values:
        title = get_single_title_xml(title)

    pieces = [
        '<title-group xmlns:xlink="http://www.w3.org/1999/xlink">',
        f'<article-title xml:space="preserve">{title}</article-title>',
    ]

    if trans_title and trans_lang:
        if with_tex_values:
            trans_title = get_single_title_xml(trans_title)
        pieces.append(
            f'<trans-title-group xml:lang="{trans_lang}"><trans-title>{trans_title}</trans-title></trans-title-group>'
        )

    pieces.append("</title-group>")

    return "".join(pieces)

3609 

3610 

def get_issue_title_xml(title, lang, trans_title=None, trans_lang=None):
    """
    Build the <issue-title> XML of an issue title and, optionally, of its translation.

    Both titles go through get_single_title_xml. The translation is written as a
    second <issue-title>, only if trans_title and trans_lang are both given.

    :param title: the issue title
    :param lang: the language of the issue title
    :param trans_title: the translated issue title
    :param trans_lang: the language of the translated issue title
    :return: the XML string (one or two <issue-title> elements)
    """
    titles = [(lang, title)]
    if trans_title and trans_lang:
        titles.append((trans_lang, trans_title))

    return "".join(
        f'<issue-title xml:lang="{title_lang}" xml:space="preserve">{get_single_title_xml(text)}</issue-title>'
        for title_lang, text in titles
    )

3623 

3624 

def get_name_params(first_name, last_name, prefix, suffix, orcid):
    """
    Gather the parts of a person name in a dict and pass it to helper_update_name_params.

    :return: the dict, with the keys "first_name", "last_name", "prefix", "suffix",
             "orcid", after the call to helper_update_name_params
             (presumably the helper completes the dict in place - to be confirmed)
    """
    name_params = dict(
        first_name=first_name,
        last_name=last_name,
        prefix=prefix,
        suffix=suffix,
        orcid=orcid,
    )
    helper_update_name_params(name_params)

    return name_params

3636 

3637 

def get_tex_from_xml(xml, tag, **kwargs):
    """
    Get the TeX version of an abstract or of a title given as an XML string.

    For the "abstract" and "title" tags, the XML is wrapped in a minimal
    <article><front><article-meta> tree, which is parsed with JatsArticle.

    :param xml: the XML string of the abstract or of the title group
    :param tag: "abstract" or "title"
    :param kwargs: passed to the JatsArticle constructor
    :return: - tag == "abstract": the value_tex of the first abstract,
               or "" if no abstract was found in the XML
             - tag == "title": the tuple (title_tex, trans_title_tex)
             - any other tag: ""
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")

    text = xml
    if tag in ("abstract", "title"):
        text = f"<article><front><article-meta>{text}</article-meta></front></article>"

    tree = etree.fromstring(text.encode("utf-8"), parser=xml_parser)
    xarticle = JatsArticle(tree=tree, **kwargs)

    if tag == "abstract":
        # The XML may not contain any abstract: do not fail on an empty list
        abstracts = xarticle.abstracts
        return abstracts[0]["value_tex"] if abstracts else ""

    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex

    return ""