Coverage for apps/ptf/cmds/xml/jats/jats

1##################################################################################################

3# README

5# jats_parser.py is a replacement of xmldata.py

6# The purpose is to parse a JATS xml (or BITS) tree from top to bottom.

7# Each node is read only once.

9# JatsArticle, JatsIssue, JatsJournal, BitsBook are the objects created by xml_cmds.

10# The xml tree is parsed in the class constructor (__init__)

11# These classes have parse_<tag> functions to parse the xml nodes and set instance variables.

12# Some parse_<tag> functions are called directly.

13# Ex: if tag == "article-meta":

14# self.parse_article_meta(child)

15# Other parse_<tag> functions are called "automatically"

16# fct_name = 'parse_' + tag.replace('-', '_')

17# ftor = getattr(self, fct_name, None)

18# if callable(ftor):

19# ftor(child)

20#

21# JatsBase and JatsArticleBase are base classes.

22# They provide common instance variables and their corresponding parse_<tag> functions

23#

24# html_from_<tag> are used to generate the HTML text of a node with mixed content:

25# a node that mixes text, children and tail

26# These functions can also extract data and set instance variables (ex: self.figures)

27#

28# get_data_from_* parse a node, but simply return data (text, dict,...) without side effects

29#

30# At the end of this file, there are some functions that are/were called by ptf-tools.

31# They are kept here for simplicity: we can switch xmldata entirely with jats_parser

32#

33# TODO: the import OAI or the import of a collection could simply call the first function

34# (def parser(tree))

35#

36##################################################################################################

38import copy

39import inspect

40import os

41import re

43from lxml import etree

44from pylatexenc.latexencode import unicode_to_latex

46from django.conf import settings

47from django.urls import reverse

48from django.utils import timezone

50from matching import scrapping

51from ptf.cmds.xml.citation_html import add_span_class_to_html_from_article_title

52from ptf.cmds.xml.citation_html import add_span_class_to_html_from_authors

53from ptf.cmds.xml.citation_html import add_span_class_to_html_from_chapter_title

54from ptf.cmds.xml.citation_html import add_span_class_to_html_from_source

55from ptf.cmds.xml.citation_html import add_span_class_to_html_from_volume

56from ptf.cmds.xml.citation_html import get_citation_html

57from ptf.cmds.xml.xml_base import RefBase

58from ptf.cmds.xml.xml_base import XmlParserBase

59from ptf.cmds.xml.xml_utils import escape

60from ptf.cmds.xml.xml_utils import get_contrib_xml

61from ptf.cmds.xml.xml_utils import get_elsevier_image_extensions

62from ptf.cmds.xml.xml_utils import get_normalized_attrib

63from ptf.cmds.xml.xml_utils import get_text_from_node

64from ptf.cmds.xml.xml_utils import get_xml_from_node

65from ptf.cmds.xml.xml_utils import helper_update_name_params

66from ptf.cmds.xml.xml_utils import make_links_clickable

67from ptf.cmds.xml.xml_utils import normalize

68from ptf.cmds.xml.xml_utils import normalize_space

69from ptf.cmds.xml.xml_utils import split_kwds

70from ptf.display import resolver

71from ptf.model_data import ArticleData

72from ptf.model_data import BookData

73from ptf.model_data import BookPartData

74from ptf.model_data import CollectionData

75from ptf.model_data import Foo

76from ptf.model_data import IssueData

77from ptf.model_data import JournalData

78from ptf.model_data import MathdocPublicationData

79from ptf.model_data import PublisherData

80from ptf.model_data import create_contributor

81from ptf.model_data import create_extlink

84class JatsBase(XmlParserBase):

def __init__(self, *args, **kwargs):
    """
    Initialise the parser state shared by all JATS/BITS parsers.

    The positional and keyword arguments are accepted but not forwarded to the base class.
    """
    super().__init__()

    # Flags that alter the TeX output of the parse_node_with_* functions:
    # - for_tex_file: used to create a Tex file from an XML value (ie abstract)
    # - add_span_around_tex_formula: used to convert an XML value for CKEditor (ie abstract)
    self.for_tex_file = False
    self.add_span_around_tex_formula = False

    # Filled while the tree is parsed
    self.tree = None
    self.fns = []
    self.warnings = []

def parse_tree(self, tree):
    """
    Store the XML tree and its language.

    :param tree: root node of the XML tree
    :return: None. self.tree and self.lang are set; self.lang falls back to "und"
    """
    self.tree = tree

    lang = get_normalized_attrib(tree, "lang")
    self.lang = lang if lang else "und"

def parse_node_with_article_title(self, node, **kwargs):
    """
    Parse an article-title node.

    :param node: XML node
    :param kwargs: if is_mixed_citation is truthy, the HTML is passed through
                   add_span_class_to_html_from_article_title
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_article_title(html, **kwargs)

    return tex, html

107

def parse_node_with_break(self, node, **kwargs):
    """
    Convert a <break/> node.

    :param node: XML node (not read)
    :return: (tex, html). tex is a TeX newline when self.for_tex_file is set, a space otherwise
    """
    if self.for_tex_file:
        return "\\newline\n", "<br/>"

    return " ", "<br/>"

113

def parse_node_with_chem_struct_wrap(self, node, **kwargs):
    """
    Render a chem-struct-wrap as a 2-cell table: the structure on the left, the label on the right.

    :param node: XML node of a chem-struct-wrap
    :return: (text, text): the same HTML string is returned for the TeX and the HTML values
    """
    struct_id = node.attrib["id"] if "id" in node.attrib else None
    label = None
    inner_parts = []

    for child in node:
        is_label = normalize(child.tag) == "label"
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if is_label:
            label = child_html
        else:
            inner_parts.append(child_html)

    id_attr = f'id="{struct_id}" ' if struct_id else ""
    label_cell = label if label else ""
    text = (
        f'<table {id_attr}class="formula"><tr><td class="formula-inner">{"".join(inner_parts)}</td>'
        f'<td class="formula-label">{label_cell}</td></tr></table>'
    )

    return text, text

141

def parse_node_with_disp_quote(self, node, **kwargs):
    """
    Wrap the content of a disp-quote in a <div class="disp-quote">.

    :param node: XML node of a disp-quote
    :return: (tex, html), both wrapped in the same div
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    open_div = '<div class="disp-quote">'
    return f"{open_div}{inner_tex}</div>", f"{open_div}{inner_html}</div>"

149

def parse_node_with_boxed_text(self, node, **kwargs):
    """
    Wrap the content of a boxed-text in a <div class="boxed-text">.

    :param node: XML node of a boxed-text
    :return: ("", html). The id of the node, if any, is copied to the div
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    box_id = node.attrib.get("id")
    id_attr = f'id="{box_id}" ' if box_id else ""

    return "", f'<div {id_attr}class="boxed-text">{inner_html}</div>'

163

def parse_node_with_fig(self, node, **kwargs):
    """
    Ex: <fig><label>LABEL</label><caption><title>TITLE</title>CAPTION</caption><graphic/></fig>
    becomes: <figure><img><figcaption>LABEL : TITLE<p>CAPTION</p></figcaption></figure>

    Children that are not handled are reported in self.warnings.
    If kwargs["append_floats"] is truthy, self.floats exists and the fig has an id,
    the HTML is also stored in self.floats[id].

    :param node: XML node of a fig
    :param kwargs: options forwarded to the child parsers
    :return: ("", html): a fig produces no TeX
    """
    fig_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = title_html = None
    # Starts as an empty string (not None): the attrib/permissions branches below append to it
    # and would otherwise insert the literal text "None" when no caption has been seen yet
    caption_html = ""
    img_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "caption":
            for caption_child in child:
                tag = normalize(caption_child.tag)
                if tag == "title":
                    _, title_html = self.parse_node_with_mixed_content(caption_child, **kwargs)
                elif tag == "p":
                    _, caption_p_html = self.parse_node_with_mixed_content(
                        caption_child, **kwargs
                    )
                    if caption_html:
                        # 2nd paragraph and beyond: tag the first one, display the others smaller
                        caption_html = caption_html.replace(
                            "<p>", '<p class="fig-first-caption">', 1
                        )
                        caption_html += caption_p_html.replace(
                            "<p>", '<p class="fig-small-caption">', 1
                        )
                    else:
                        caption_html = caption_p_html
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )
        elif tag == "graphic":
            _, graphic_html = self.parse_node_with_graphic(child, **kwargs)
            img_html += graphic_html
        elif tag == "attrib":
            _, attrib_html = self.parse_node_with_mixed_content(child, **kwargs)
            caption_html = f'{caption_html}<p class="fig-small-caption">{attrib_html}</p>'
        elif tag == "permissions":
            for gchild in child:
                # NOTE(review): the tag is compared without normalize(), unlike the other tests
                if gchild.tag == "copyright-statement":
                    _, statement_html = self.parse_node_with_mixed_content(gchild, **kwargs)
                    caption_html = (
                        f'{caption_html}<p class="fig-small-caption">{statement_html}</p>'
                    )
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    html = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
    html += img_html

    if label_html or title_html or caption_html:
        html += "<figcaption>"
        if label_html:
            html += label_html
        if label_html and title_html:
            html += " : "
        if title_html:
            html += title_html
        if caption_html:
            html += caption_html
        html += "</figcaption>"

    html += "</figure>"

    if kwargs.get("append_floats") and hasattr(self, "floats") and fig_id is not None:
        self.floats[fig_id] = html

    return "", html

268

def parse_node_with_fn(self, node, **kwargs):
    """
    Ex: <fn><label>LABEL</label><p>TEXT</p></fn>

    By default the footnote is stripped from the returned HTML and appended to self.fns
    (once: duplicates are ignored).

    :param node: XML node of a fn
    :param kwargs: keep_fn (default False): return the footnote HTML instead of storing it
                   keep_fn_label (default True): display the label in a <sup>
    :return: ("", html). html is '' unless keep_fn is truthy
    """
    fn_id = node.attrib["id"] if "id" in node.attrib else None
    label_html = None
    fn_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            _, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            _, fn_html = self.parse_node_with_mixed_content(child, **kwargs)
            # The footnote gets its own <p> below: remove the one of the child
            fn_html = fn_html.replace("<p>", "").replace("</p>", "")
        else:
            warning = (
                self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            )
            self.warnings.append({self.pid: warning})

    html = '<p id="' + fn_id + '">' if fn_id else "<p>"

    if label_html and kwargs.get("keep_fn_label", True):
        html += f"<sup>{label_html}</sup> "

    html += fn_html + "</p>"

    # keep_fn used to be read with kwargs["keep_fn"], which raised a KeyError
    # when the caller did not provide it
    if kwargs.get("keep_fn", False):
        return "", html

    if html not in self.fns:
        self.fns.append(html)

    return "", ""

315

def parse_node_with_graphic(self, node, **kwargs):
    """
    The href value of graphics used in our XML can have the following values
    - relative path to the issue XML folder (Elsevier JATS)
    - full path starting with "file:/" (Elsevier JATS created in early 2022)
    - simple file name (with no relative path) in the RVT FullText XML

    After the import, we want
    - the files located in the src/tex/figures article folder
    - the url pointing to the image, built thanks to kwargs['base_url']

    addRelatedObjectPtfCmd will copy the images to the src/tex/figures folder if the location starts with file:/
    => change the location to "file:/..." for Elsevier JATS (the xarticle has a pii attribute)

    The dict describing the image is appended to self.figures and self.related_objects
    (unless self.figures already contains it).

    :param node: XML node of a graphic
    :param kwargs: base_url is required when the node has an href
    :return: ("", html). html is '' when the node has no href
    """
    href = ""
    for attrib in node.attrib:
        if normalize(attrib) == "href":
            href = node.attrib[attrib]

    if not href:
        # Nothing to display. The final return used to read an unassigned img_text
        # (UnboundLocalError) in this case
        return "", ""

    basename = os.path.basename(href)
    ext = basename.split(".")[-1]
    is_png = ext == "png"
    mimetype = "image/png" if is_png else "image/jpeg"

    img_url = "src/tex/figures/" + basename

    if ext in get_elsevier_image_extensions():  # Elsevier uses "jc3" instead of jpg. WTF ?
        img_url = img_url[0 : -len(ext)] + "jpg"

    data_location = href if "file:/" in href else img_url
    if (
        hasattr(self, "pii")
        and hasattr(self, "issue")
        and "file:/" not in href
        and self.from_folder
    ):
        base_dir = self.issue.journal.pid
        if os.path.dirname(href) != base_dir:
            href = os.path.join(self.from_folder, base_dir, self.issue.pid, href)
        # NOTE(review): the nesting of this assignment could not be read from the original
        # layout; it is assumed to apply to every Elsevier href - confirm
        data_location = "file:" + href

    data = {
        "rel": "html-image",
        "mimetype": mimetype,
        "location": data_location,
        "base": None,
        "metadata": node.text if node.text is not None else "",
    }

    img_url = os.path.join(kwargs["base_url"], "png" if is_png else "jpg", img_url)

    img_text = '<a href="' + img_url + '" data-lightbox="image-'
    img_text += str(len(self.figures)) + '" title="">'
    img_text += '<img src="' + img_url + '" class="article-body-img" />'
    img_text += "</a>"

    if data not in self.figures:
        self.figures.append(data)
        self.related_objects.append(data)

    return "", img_text

384

def parse_node_with_inline_formula(self, node, **kwargs):
    """
    Convert an inline-formula or a disp-formula.

    The TeX value comes from <alternatives><tex-math>, the MathML value from <alternatives><math>.
    Other children, except <label>, are reported in self.warnings.

    :param node: XML node of a formula. Any altimg attribute of its <math> child is removed
    :return: (tex, html). html is a <span class="mathjax-formula">, wrapped in a
             <table class="formula"> if there is a label or if the node is a disp-formula
    """
    # MathJAX is doing a good job with formulae and is now the standard
    # MathML could be ignored in HTML (the original XML value is preserved with value_xml)
    # We could simply return the tex-math text
    # But there are multiple errors in the TeX of the Mersenne articles.
    # We first need to fix those mistakes before switching to TeX

    tex_math = ""
    math_text = ""
    label = None
    formula_id = node.attrib["id"] if "id" in node.attrib else None
    # NOTE(review): node.tag is compared as is (no normalize()), as in the original code
    is_display = node.tag == "disp-formula"

    for child in node:
        tag = normalize(child.tag)
        if tag == "alternatives":
            for alternative in child:
                tag = normalize(alternative.tag)
                if tag == "tex-math":
                    tex_math = alternative.text or ""
                elif tag == "math":
                    # Elsevier sometimes provides the formula as an alternative image. Remove it.
                    alternative.attrib.pop("altimg", None)

                    math_text = get_xml_from_node(alternative).replace("mml:", "")
                    math_text = math_text.replace(
                        'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                    )
                    math_text = math_text.replace(
                        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"', ""
                    )
                    if is_display:
                        math_text = math_text.replace("<math", '<math display="block"')
        elif tag == "label":
            label = child.text or ""
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    if (math_text == "") != (tex_math == ""):
        # Only one of the 2 alternatives is present: print the parse_<tag> callers to locate it
        stack_str = " ".join(
            [
                frameinfo[3]
                for frameinfo in inspect.stack()[1:]
                if frameinfo[3].find("parse_") == 0
                and "parse_node" not in frameinfo[3]
                and "parse_inner" not in frameinfo[3]
                and "parse_tree" not in frameinfo[3]
                and "parse_article_meta" not in frameinfo[3]
            ]
        )
        print(f"{self.pid} no math formula for {stack_str}")
        # raise ValueError("No formula alternative")

    if not is_display:
        # Inline formulas are delimited by $
        if tex_math != "" and tex_math[0] != "$":
            tex_math = "$" + tex_math
        if tex_math != "" and tex_math[-1] != "$":
            tex_math = tex_math + "$"

    tex = tex_math
    in_table = bool(label) or is_display

    html = ""
    if in_table:
        html += '<table class="formula"><tr><td class="formula-inner">'

    html += '<span class="mathjax-formula" '
    if formula_id:
        html += 'id="' + formula_id + '" '
    alt_text = tex_math.replace("\n", "") if is_display else tex_math
    span_content = math_text if math_text else tex_math
    html += f'data-tex="{alt_text}">{span_content}</span>'

    if in_table:
        html += '</td><td class="formula-label">'
        if label:
            html += label
        html += "</td></tr>"
        html += "</table>"

    if self.add_span_around_tex_formula:
        # "\\$" produces the same 2 characters (backslash + dollar) as the former "\$",
        # which is an invalid escape sequence.
        # NOTE(review): tex[1:-1] assumes that tex is delimited by $, which is only
        # enforced above for inline formulas - confirm for disp-formula
        tex = f'<span class="mathjax-formula">\\${tex[1:-1]}\\$</span>'

    return tex, html

481

def parse_node_with_institution_id(self, node, **kwargs):
    """
    An institution-id is ignored: it produces neither TeX nor HTML.

    :param node: XML node (not read)
    :return: ("", "")
    """
    empty = ""
    return empty, empty

484

def parse_node_with_italic(self, node, **kwargs):
    """
    Convert an <italic> node.

    :param node: XML node of an italic
    :return: (tex, html). html is wrapped in a <span class="italique">;
             tex in {\\it } for a TeX file, in <i> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    if self.for_tex_file:
        tex = "{\\it " + inner_tex + "}"
    else:
        tex = f"<i>{inner_tex}</i>"

    return tex, f'<span class="italique">{inner_html}</span>'

505

def parse_node_with_list(self, node, **kwargs):
    """
    Convert a <list> according to its list-type attribute.

    - "bullet" / "simple": <ul> (itemize in a TeX file)
    - "order" / "number", "alpha-*", "roman-*": <ol> with the matching type
      ("order"/"number" lists continued from another list get a start value)
    - anything else: <ul> without bullets
    For a TeX file, all the non bullet lists become an enumerate and the HTML is left unwrapped.

    :param node: XML node of a list
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    start = None
    if node.get("continued-from") is not None:
        start = self.get_list_start_value(node) + 1

    list_type = node.get("list-type")

    if list_type in ("bullet", "simple"):
        if self.for_tex_file:
            tex = "\n\\begin{itemize}\n" + tex + "\\end{itemize}\n"
        else:
            tex = f"<ul>{tex}</ul>"
        return tex, f"<ul>{html}</ul>"

    if self.for_tex_file:
        return "\n\\begin{enumerate}\n" + tex + "\\end{enumerate}\n", html

    ol_types = {
        "order": "1",
        "number": "1",
        "alpha-lower": "a",
        "alpha-upper": "A",
        "roman-lower": "i",
        "roman-upper": "I",
    }
    ol_type = ol_types.get(list_type)

    if ol_type is None:
        open_tag = '<ul class="no-bullet" style="list-style-type:none;">'
        close_tag = "</ul>"
    else:
        # Only the numbered lists can be continued
        start_attr = f' start="{str(start)}"' if ol_type == "1" and start is not None else ""
        open_tag = f'<ol type="{ol_type}"{start_attr}>'
        close_tag = "</ol>"

    return f"{open_tag}{tex}{close_tag}", f"{open_tag}{html}{close_tag}"

550

def parse_node_with_list_item(self, node, **kwargs):
    """
    <list-item><label>LABEL</label><p>TEXT</p> becomes
    <li>LABEL TEXT</li>
    (same with <title>)

    The first <p> is inlined. The following <p> (wrapped in <p> in the HTML) and the nested
    <list> are appended after it, in document order.
    Other children are reported in self.warnings.

    :param node: XML node of a list-item
    :return: (tex, html). tex is a \\item for a TeX file, a <li> otherwise
    """
    label_tex = label_html = ""
    title_tex = title_html = ""
    p_tex = p_html = ""
    content_tex = content_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            label_tex, label_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child, **kwargs)
        elif tag == "p":
            if p_html == "" and content_html == "":
                p_tex, p_html = self.parse_inner_node(child, **kwargs)
            else:
                # Accumulate: assigning here used to drop the previous extra <p> or <list>
                extra_tex, extra_html = self.parse_inner_node(child, **kwargs)
                content_tex += extra_tex
                content_html += f"<p>{extra_html}</p>"
        elif tag == "list":
            list_tex, list_html = self.parse_node_with_mixed_content(child, **kwargs)
            content_tex += list_tex
            content_html += list_html
        # TODO if tag == "def-list":
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    inner_tex = ""
    if label_tex:
        inner_tex += label_tex + " "
    if title_tex:
        inner_tex += title_tex + " "
    inner_tex += p_tex + content_tex

    if self.for_tex_file:
        tex = "\\item " + inner_tex + "\n"
    else:
        tex = f"<li>{inner_tex}</li>"

    html = "<li>"
    if label_html:
        html += label_html + " "
    if title_html:
        html += title_html + " "
    html += p_html + content_html + "</li>"

    return tex, html

611

def parse_node_with_name_content(self, node, **kwargs):
    """
    A name-content node adds no markup: return its inner content as is.

    :param node: XML node of a name-content
    :return: (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return inner_tex, inner_html

615

def parse_node_with_p(self, node, **kwargs):
    """
    Convert a <p> node.

    The specific-use attribute, if any, becomes the class of the HTML <p>.
    If the parser has both floats_to_insert and floats, the pending floats are appended
    after the paragraph: floats_to_insert is emptied and the inserted floats are removed
    from floats.

    :param node: XML node of a p
    :return: (tex, html). tex is not wrapped in <p> for a TeX file
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    if not self.for_tex_file:
        tex = f"<p>{tex}</p>"

    css_class = node.get("specific-use")
    open_tag = f'<p class="{css_class}">' if css_class else "<p>"
    html = f"{open_tag}{inner_html}</p>"

    if hasattr(self, "floats_to_insert") and hasattr(self, "floats"):
        while len(self.floats_to_insert) > 0:
            float_id = self.floats_to_insert.pop(0)
            if float_id in self.floats:
                html += self.floats.pop(float_id)

    return tex, html

636

def parse_node_with_sc(self, node, **kwargs):
    """
    Convert a <sc> (small caps) node.

    :param node: XML node of a sc
    :return: (tex, html). Only the HTML is wrapped, in a <span class="smallcaps">
    """
    tex, inner_html = self.parse_inner_node(node, **kwargs)

    return tex, f'<span class="smallcaps">{inner_html}</span>'

642

def parse_node_with_sec(self, node, **kwargs):
    """
    <sec><title>TITLE</title><p>TEXT</p> becomes
    <section><h@i>TITLE</h@i><p>TEXT</p> (i is the current level and is increased for children)

    :param node: XML node of a sec
    :param kwargs: sec_level is the level of the heading (2 if absent).
                   The children are parsed with sec_level + 1
    :return: (tex, html). tex is the label, the title and the content without any markup
    """
    sec_level = kwargs.get("sec_level", 2)
    kwargs["sec_level"] = sec_level + 1

    label_tex = label_html = title_tex = title_html = None
    body_tex = body_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag == "label":
            # The label and the title are parsed without the kwargs
            label_tex, label_html = self.parse_node_with_mixed_content(child)
        elif tag == "title":
            title_tex, title_html = self.parse_node_with_mixed_content(child)
        else:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **kwargs)
            body_tex += child_tex
            body_html += child_html

    tex = ""
    html = "<section>"

    if label_html or title_html:
        heading = f"h{str(sec_level)}"
        html += f"<{heading}>"
        if label_html:
            tex += label_tex
            html += label_html
            if title_html:
                tex += " "
                html += " "
        if title_html:
            tex += title_tex
            html += title_html
        html += f"</{heading}>"

    return tex + body_tex, html + body_html + "</section>"

690

def parse_node_with_string_name(self, node, **kwargs):
    """
    Parse a string-name node.

    :param node: XML node of a string-name
    :param kwargs: if is_mixed_citation is truthy, the HTML is title-cased and passed
                   through add_span_class_to_html_from_authors
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_authors(html.title(), **kwargs)

    return tex, html

699

def parse_node_with_strong(self, node, **kwargs):
    """
    Convert a <strong> node.

    :param node: XML node of a strong
    :return: (tex, html). html is wrapped in <strong>;
             tex in {\\bf } for a TeX file, in <strong> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "{\\bf " + inner_tex + "}" if self.for_tex_file else f"<strong>{inner_tex}</strong>"

    return tex, f"<strong>{inner_html}</strong>"

710

def parse_node_with_styled_content(self, node, **kwargs):
    """
    Convert a <styled-content> node.

    :param node: XML node of a styled-content
    :return: (tex, html). html is wrapped in a <span> with the style attribute of the node,
             unless the attribute is absent or empty
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    style = node.attrib.get("style", "")
    if style == "":
        return tex, html

    return tex, f'<span style="{style}">{html}</span>'

720

def parse_node_with_sub(self, node, **kwargs):
    """
    Convert a <sub> node.

    :param node: XML node of a sub
    :return: (tex, html). html is wrapped in <sub>;
             tex in \\textsubscript{} for a TeX file, in <sub> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsubscript{" + inner_tex + "}" if self.for_tex_file else f"<sub>{inner_tex}</sub>"

    return tex, f"<sub>{inner_html}</sub>"

731

def parse_node_with_sup(self, node, **kwargs):
    """
    Convert a <sup> node.

    :param node: XML node of a sup
    :return: (tex, html). html is wrapped in <sup>;
             tex in \\textsuperscript{} for a TeX file, in <sup> otherwise
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    tex = "\\textsuperscript{" + inner_tex + "}" if self.for_tex_file else f"<sup>{inner_tex}</sup>"

    return tex, f"<sup>{inner_html}</sup>"

742

def parse_node_with_table_generic(self, node, **kwargs):
    """
    Convert a node of a table (table, tr, td,...) to the HTML element with the same name.

    "row" and "entry" are renamed "tr" and "td".
    A <table> gets the "table" class, plus a "nowrap-col-<i>" class for each col of its
    colgroup that has a width (i is the 1-based position of the col).
    The rowspan, colspan, align and style attributes are copied; valign becomes a class.

    :param node: XML node
    :return: ("", html)
    """
    _, inner_html = self.parse_inner_node(node, **kwargs)

    tag = normalize(node.tag)
    tag = {"row": "tr", "entry": "td"}.get(tag, tag)

    attrs = []

    if tag == "table":
        css_classes = ["table"]
        # NOTE(review): the counter is assumed to be increased for every col (with or
        # without a width); the original layout did not allow to confirm it
        for position, col in enumerate(node.xpath("colgroup/col"), start=1):
            if "width" in col.attrib:
                css_classes.append(f"nowrap-col-{position}")
        class_table = " ".join(css_classes)
        attrs.append(f' class="{class_table}"')

    for name in ("rowspan", "colspan", "align"):
        if name in node.attrib:
            attrs.append(" " + name + '="' + node.attrib[name] + '"')
    if "valign" in node.attrib:
        attrs.append(' class="td-valign-' + node.attrib["valign"] + '"')
    if "style" in node.attrib:
        attrs.append(' style="' + node.attrib["style"] + '"')

    open_tag = "<" + tag + "".join(attrs) + ">"

    return "", f"{open_tag}{inner_html}</{tag}>"

779

def parse_node_with_table_wrap(self, node, **kwargs):
    """
    Create a <div class="table-wrap"> around the table

    The label (in <strong>) and the caption are displayed in a
    <div class="table-wrap-header"> before the other children.
    If kwargs["append_floats"] is truthy, self.floats exists and the table-wrap has an id,
    the HTML is also stored in self.floats[id].

    :param node: XML node of a table-wrap
    :return: ("", html)
    """
    table_id = node.attrib.get("id")
    label = caption = None
    body_parts = []

    for child in node:
        tag = normalize(child.tag)
        _, child_html = self.parse_node_with_mixed_content(child, **kwargs)
        if tag == "label":
            label = child_html
        elif tag == "caption":
            caption = child_html
        else:
            body_parts.append(child_html)

    id_attr = f' id="{table_id}"' if table_id else ""
    pieces = [f'<div class="table-wrap table-responsive"{id_attr}>']

    if label or caption:
        pieces.append('<div class="table-wrap-header">')
        if label:
            pieces.append("<strong>" + label + "</strong>")
        if caption:
            if label:
                pieces.append(" ")
            pieces.append(caption)
        pieces.append("</div>")

    pieces.extend(body_parts)
    pieces.append("</div>")
    text = "".join(pieces)

    if kwargs.get("append_floats") and hasattr(self, "floats") and table_id is not None:
        self.floats[table_id] = text

    return "", text

835

def parse_node_with_table_wrap_foot(self, node, **kwargs):
    """
    Create a <div class="table-wrap-foot"> at bottom of the table
    Keep the footnotes inside this div

    Only the fn-group children are rendered; they are parsed with keep_fn=True.

    :param node: XML node of a table-wrap-foot
    :return: ("", html)
    """
    kwargs["keep_fn"] = True

    chunks = ['<div class="table-wrap-foot">']
    for child in node:
        if normalize(child.tag) == "fn-group":
            _, group_html = self.parse_node_with_mixed_content(child, **kwargs)
            chunks.append(group_html)
    chunks.append("</div>")

    return "", "".join(chunks)

856

def parse_node_with_toc(self, node, **kwargs):
    """
    Render a <toc> node as an HTML <table> wrapping the HTML of its content.

    :param node: the toc XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: ("", html): the TeX returned by parse_inner_node is discarded
    """
    _, rows_html = self.parse_inner_node(node, **kwargs)

    return "", "<table>" + rows_html + "</table>"

866

def parse_node_with_toc_entry(self, node, **kwargs):
    """
    Render a <toc-entry> node as one HTML table row, followed by the rows
    of its nested <toc-entry> children.

    The row has two cells: "<label> <title>" and "p. <page>". When a
    <nav-pointer specific-use="pageindex"> is found inside a
    <nav-pointer-group>, both cells are wrapped in a link to the item PDF,
    opened at that page index + 1.

    :param node: the toc-entry XML node
    :param kwargs: parsing options; "inside_toc_entry" marks a nested entry
    :return: ("", html)
    """
    label = title = page = anchor = nested_rows = ""
    toc_class = "inside-toc" if kwargs.get("inside_toc_entry") else ""

    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            title = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "label":
            label = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "nav-pointer":
            page = self.parse_node_with_mixed_content(child, **kwargs)[1]
        elif tag == "nav-pointer-group":
            for pointer in child:
                if pointer.tag != "nav-pointer":
                    continue
                use = pointer.attrib.get("specific-use")
                if use == "pagenum":
                    page = self.parse_node_with_mixed_content(pointer, **kwargs)[1]
                elif use == "pageindex":
                    anchor = int(pointer.text) + 1
        elif tag == "toc-entry":
            # Nested entries are parsed without the caller's kwargs:
            # only the inside_toc_entry flag is passed
            nested_rows += self.parse_node_with_mixed_content(child, inside_toc_entry=True)[1]

    entry_text = f"{label} {title}"
    page_text = f"p. {page}"

    if anchor:
        href = reverse("item-pdf", kwargs={"pid": self.pid, "extension": "pdf"})
        href += f"#page={anchor}"
        entry_text = f'<a href="{href}">{entry_text}</a>'
        page_text = f'<a href="{href}">{page_text}</a>'

    row = f'<tr><td class="{toc_class}">{entry_text}</td><td class="toc-page">{page_text}</td></tr>'

    return "", row + nested_rows

920

def parse_node_with_underline(self, node, **kwargs):
    """
    Wrap the content of an <underline> node in <u>...</u>.

    The same <u> markup is applied to both the TeX and the HTML values.

    :param node: the underline XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    inner_tex, inner_html = self.parse_inner_node(node, **kwargs)

    return f"<u>{inner_tex}</u>", f"<u>{inner_html}</u>"

927

def parse_node_with_volume(self, node, **kwargs):
    """
    Parse a <volume> node.

    Inside a mixed-citation (kwargs["is_mixed_citation"] is true), the HTML
    is post-processed by add_span_class_to_html_from_volume.

    :param node: the volume XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    tex, html = self.parse_inner_node(node, **kwargs)

    if kwargs.get("is_mixed_citation", False):
        html = add_span_class_to_html_from_volume(html, **kwargs)

    return tex, html

936

def parse_node_with_xref(self, node, **kwargs):
    """
    Render an <xref> node as an HTML link to the first id listed in @rid.

    A target id starting with "bib" is rewritten as "r" + the rest of the id.
    When @ref-type is "fig", "table" or "textbox" and the instance has a
    floats_to_insert list, every id of @rid is appended to that list.

    Nothing is returned (empty strings) when kwargs["ignore_xref"] is true
    or when @rid holds no id (missing, empty or whitespace-only).

    :param node: the xref XML node
    :param kwargs: parsing options, forwarded to parse_inner_node
    :return: (tex, html)
    """
    if kwargs.get("ignore_xref"):
        return "", ""

    # split() without argument returns [] for a whitespace-only value:
    # checking the list (not the raw attribute) avoids an IndexError below
    rids = (node.get("rid") or "").split()
    if not rids:
        return "", ""

    tex, html = self.parse_inner_node(node, **kwargs)

    target = rids[0]
    if target.startswith("bib"):
        target = "r" + target[3:]
    html = f'<a href="#{target}">{html}</a>'

    # @ref-type is the same for all the ids: read it only once
    ref_type = node.get("ref-type")
    if ref_type in ("fig", "table", "textbox") and hasattr(self, "floats_to_insert"):
        self.floats_to_insert.extend(rids)

    return tex, html

959

def parse_inner_node(self, node, **kwargs):
    """
    Return the TeX and HTML of the content of a node (its text and its
    children), without the node's own tag and without its tail.

    Used by the parse_node_with_<tag> functions for nodes that have a
    different tag in HTML.

    :param node: XML node
    :param kwargs: parsing options; is_top is forced to False for the children
    :return: (tex, html)
    """
    kwargs["is_top"] = False
    kwargs.setdefault("is_body_html", False)

    leading_text = node.text
    if leading_text:
        # The TeX value is converted to LaTeX only when generating a TeX file
        tex = unicode_to_latex(leading_text) if self.for_tex_file else leading_text
        html = escape(leading_text)
    else:
        tex = html = ""

    for child in node:
        piece_tex, piece_html = self.parse_node_with_mixed_content(child, **kwargs)
        tex += piece_tex
        html += piece_html

    return tex, html

984

def parse_node_with_mixed_content(self, node, **kwargs):
    """
    Parse an XML node which mixes text and XML sub-nodes, and return its
    TeX and HTML texts.
    Ex: <node>text1 <a>text_a</a> text2 <b>text_b</b>b_tail</node>

    If the instance has a parse_node_with_<tag> method for the node's tag
    (after alias mapping), that method produces the inner text. Otherwise
    the node's text and its children (parsed recursively) are concatenated.
    The node's tail is appended only for recursive (non top-level) calls.

    :param node: XML node (None returns two empty strings)
    :param kwargs: parsing options; missing options get a default value
    :return: (tex, html)
    """
    if node is None:
        return "", ""

    option_defaults = (
        # The tail (text following the end of the node) is included
        # only if the function was called recursively
        ("is_top", True),
        # sec_level is used to add <h1>, <h2>,... while parsing nodes like <sec>
        ("sec_level", 2),
        # Text in <comment> is parsed to add HTML links
        ("add_HTML_link", False),
        # base_url of the image links
        ("base_url", ""),
        # Footnotes are removed from the fulltext (and put at the end),
        # except for those in a table
        ("keep_fn", False),
        ("is_citation", False),
        ("is_comment", False),
        # mixed-citation ignores ext-link
        ("add_ext_link", False),
        # TODO remove once jats_parser has been validated against xmldata
        ("temp_math", False),
        ("temp_tex", False),
        ("is_mixed_citation", False),
        ("is_body_html", False),
    )
    for option, default in option_defaults:
        kwargs.setdefault(option, default)

    tag = normalize(node.tag)

    # pub-id/object-id are ignored by default as they are treated separately
    if tag in ("pub-id", "object-id") and not kwargs["is_comment"]:
        return "", ""

    if tag in ("mixed-citation", "toc"):
        kwargs["is_citation"] = True
    elif tag == "comment":
        kwargs["is_comment"] = True

    tex = html = inner_tex = inner_html = ""

    # I. Add the node's text.
    # Some tags share the parse_node_with_<tag> function of another tag
    aliases = {
        "statement": "sec",
        "disp-formula": "inline-formula",
        "chapter-title": "article-title",
        "bold": "strong",
    }
    table_tags = (
        "table",
        "th",
        "tr",
        "td",
        "thead",
        "tbody",
        "colgroup",
        "col",
        "tgroup",
        "entry",
        "row",
    )
    aliases.update(dict.fromkeys(table_tags, "table-generic"))

    handler_name = "parse_node_with_" + aliases.get(tag, tag).replace("-", "_")
    handler = getattr(self, handler_name, None)

    if callable(handler):
        inner_tex, inner_html = handler(node, **kwargs)
    elif tag in ("ext-link", "uri"):
        # Add HTML links
        inner_tex = inner_html = self.helper_add_link_from_node(node, **kwargs)
        # Update self.ext_links. Useful for <ext-link> deep in a <mixed_citation>,
        # and not caught by parse_citation_node
        if tag == "ext-link" and kwargs["add_ext_link"] and not kwargs["is_comment"]:
            found_extid = self.parse_ext_link(node, **kwargs)
            if found_extid and kwargs["is_mixed_citation"]:
                # An extid has been found in a mixed_citation:
                # no need to add the text of the id here
                inner_tex = inner_html = ""
    elif tag == "supplementary-material":
        self.parse_supplementary_material(node, **kwargs)
    else:
        # II.1. Add the node text (before the children text)
        if node.text is not None:
            text_for_tex = node.text
            if self.for_tex_file:
                text_for_tex = unicode_to_latex(text_for_tex)
            inner_tex += text_for_tex
            inner_html += escape(node.text)

        # II.2. Children, parsed recursively as non top-level nodes
        child_kwargs = dict(kwargs)
        child_kwargs["is_top"] = False

        for child in node:
            child_tex, child_html = self.parse_node_with_mixed_content(child, **child_kwargs)

            # Case where an ext-link has been removed in a mixed-citation
            # We may have "title. , (year)"
            # Remove the comma that is now useless
            if (
                kwargs["is_mixed_citation"]
                and child_html
                and child_html[0] in [",", "."]
                and inner_html[-2:] == ". "
            ):
                inner_html = inner_html[:-1]
                child_html = child_html[1:]
                inner_tex = inner_tex[:-1]
                child_tex = child_tex[1:]

            inner_tex += child_tex
            inner_html += child_html

        # II.3. Wrap the children text with HTML links,
        # unless the node's text starts with a newline or a space
        if kwargs["add_HTML_link"] and node.text:
            if not re.match(r"[\n ]+", node.text):
                inner_html = make_links_clickable(node.text, inner_html)

    tex += inner_tex
    html += inner_html

    # III. Add the node's tail for children
    if node.tail and not kwargs["is_top"]:
        tail_for_tex = node.tail
        if self.for_tex_file:
            tail_for_tex = unicode_to_latex(tail_for_tex)
        tex += tail_for_tex
        html += escape(node.tail)

    return tex, html

1136

def parse_abstract(self, node, **kwargs):
    """
    Parse an <abstract> node and append its data to self.abstracts.

    The tag stored is the value of @abstract-type ("abstract" if missing;
    "author" is also stored as "abstract"). The language defaults to self.lang.

    :param node: the abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or self.lang
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(abstract)

1154

def parse_aff_alternatives(self, node, **kwargs):
    """
    Parse an <aff-alternatives> node and add the address found to the
    contributors of self.contributors.

    The address is the text of an <aff> child, without its leading <label>
    when there is one. An address with a label is added only to the
    contributors whose "xrefs" contain the node's id; an address without
    label is added to all the contributors. Children other than <aff>
    are reported in self.warnings.

    :param node: the aff-alternatives XML node
    :param kwargs: unused
    """
    xref_id = get_normalized_attrib(node, "id") or ""
    address = ""
    applies_to_all = True

    for child in node:
        tag = normalize(child.tag)

        if tag != "aff":
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )
            continue

        # Skip the formatted aff and use only the complete address text
        # TODO support <aff> properly
        for aff_part in child:
            if aff_part.tag == "label" and address == "":
                label = get_text_from_node(aff_part)
                address = get_text_from_node(child)[len(label) :]
                applies_to_all = False
        if address == "" and child.text:
            address = child.text

    if address == "":
        return

    for contrib in self.contributors:
        if address in contrib["addresses"]:
            continue
        is_referenced = "xrefs" in contrib and xref_id in contrib["xrefs"]
        if is_referenced or applies_to_all:
            contrib["addresses"].append(address)
            contrib["contrib_xml"] = get_contrib_xml(contrib)

1191

def parse_award_group(self, node, **kwargs):
    """
    Parse an <award-group> node and append {"abbrev", "award_id"} to
    self.awards when both a <funding-source> and an <award-id> are found.

    Other children are reported in self.warnings.

    :param node: the award-group XML node
    :param kwargs: unused
    """
    funder = None
    award = None

    for child in node:
        tag = normalize(child.tag)

        if tag == "award-id":
            award = child.text
        elif tag == "funding-source":
            funder = get_text_from_node(child)
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    if funder is None or award is None:
        return

    self.awards.append({"abbrev": funder, "award_id": award})

1215

def parse_contrib_group(self, node, **kwargs):
    """
    Parse a <contrib-group> node.

    - <contrib>: the contributor is appended to self.contributors. Its role
      is the group's @content-type without its final "s", followed by
      "|" + the contributor's own role if it has one.
    - <aff-alternatives>: handled by parse_aff_alternatives.
    - <fn>: its XML/HTML are appended to self.footnotes_xml/footnotes_html.
    - Other children are reported in self.warnings.

    :param node: the contrib-group XML node
    :param kwargs: unused
    """
    group_role = node.get("content-type") or ""
    if group_role.endswith("s"):
        group_role = group_role[:-1]

    for child in node:
        tag = normalize(child.tag)

        if tag == "contrib":
            contrib = self.get_data_from_contrib(child)
            own_role = contrib["role"]
            contrib["role"] = f"{group_role}|{own_role}" if own_role else group_role
            contrib["contrib_xml"] = get_xml_from_node(child)
            self.contributors.append(contrib)
        elif tag == "aff-alternatives":
            self.parse_aff_alternatives(child)
        elif tag == "fn":
            _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
            fn_xml = get_xml_from_node(child)
            self.footnotes_xml += fn_xml
            self.footnotes_html += fn_html
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

1246

def parse_counts(self, node, **kwargs):
    """
    Parse a <counts> node and append a (tag, value) tuple to self.counts
    for each child that has a value.

    The value is the child's @count, or its text if @count is missing.
    "book-page-count" is stored as "page-count".

    :param node: the counts XML node
    :param kwargs: unused
    """
    for child in node:
        value = child.get("count")
        if value is None:
            value = child.text
        if value is None:
            continue

        name = normalize(child.tag)
        self.counts.append(("page-count" if name == "book-page-count" else name, value))

1259

def parse_ext_link(self, node, **kwargs):
    """
    Parse an <ext-link> node: register the external id it may hold and,
    if requested, append the link data to self.ext_links.

    The link is appended only if kwargs["add_ext_link"] is true, no extid
    was found, the data are not already in self.ext_links and the link's
    "rel" is not "cover".

    :param node: the ext-link XML node
    :param kwargs: parsing options ("add_ext_link")
    :return: True if an extid was found in the link
    """
    link = self.get_data_from_ext_link(node)
    extid = self.add_extids_from_node_with_link(link)
    has_extid = extid[0] is not None

    if kwargs.get("add_ext_link", False) and not has_extid:
        if link not in self.ext_links and link["rel"] != "cover":
            self.ext_links.append(link)

    return has_extid

1274

def parse_front_matter(self, node, **kwargs):
    """
    Parse a <front-matter> node.

    Stores the node's XML in self.frontmatter_xml, the HTML of its
    <foreword> in self.frontmatter_foreword_html ("" if there is none) and
    the HTML of its <toc> in self.frontmatter_toc_html (set only when a
    <toc> is present).

    :param node: the front-matter XML node
    :param kwargs: unused
    """
    self.frontmatter_xml = get_xml_from_node(node)
    self.frontmatter_foreword_html = ""

    for child in node:
        tag = normalize(child.tag)
        if tag not in ("foreword", "toc"):
            continue

        _, child_html = self.parse_node_with_mixed_content(child)
        if tag == "foreword":
            self.frontmatter_foreword_html = child_html
        else:
            self.frontmatter_toc_html = child_html

1286

def parse_id(self, node, **kwargs):
    """
    Parse an id node (pub-id, book-id, book-part-id...) and store its value
    according to the id type found in @pub-id-type, @book-id-type or
    @book-part-id-type (checked in that order):

    - "pii": stored in self.pii, only if self.pid starts with "CR" and is
      longer than 2 characters
    - "numdam-id" / "mathdoc-id": stored in self.pid
    - "ark": appended to self.extids
    - "doi" / "eid": appended to self.ids; a doi is also stored in self.doi

    :param node: the id XML node
    :param kwargs: unused
    """
    value = node.text

    id_type = ""
    for attr_name in ("pub-id-type", "book-id-type", "book-part-id-type"):
        if attr_name in node.attrib:
            id_type = node.attrib[attr_name]
            break

    if id_type == "pii":
        # Elsevier ids get a special treatment: web scraping to find the date_published
        if self.pid and len(self.pid) > 2 and self.pid.startswith("CR"):
            self.pii = value
        return

    if id_type in ("numdam-id", "mathdoc-id"):
        self.pid = value
        return

    if id_type == "ark":
        self.extids.append((id_type, value))
        return

    if id_type in ("doi", "eid"):
        self.ids.append((id_type, value))
        if id_type == "doi":
            self.doi = value

1310

def parse_kwd_group(self, node, **kwargs):
    """
    Parse a <kwd-group> node and add its keywords to self.kwds.

    The keywords are the texts of the <kwd> children, or the result of
    split_kwds on the TeX value of an <unstructured-kwd-group> child.
    Each keyword is stored as {"type", "lang", "value"}: the type is the
    group's @content-type (or @kwd-group-type if missing or empty) and the
    language defaults to self.lang. Other children are reported in
    self.warnings.

    :param node: the kwd-group XML node
    :param kwargs: unused
    """
    kwds = []

    for child in node:
        tag = normalize(child.tag)

        if tag == "kwd":
            kwds.append(child.text)
        elif tag == "unstructured-kwd-group":
            value_tex, _ = self.parse_node_with_mixed_content(child)
            kwds = split_kwds(value_tex)
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    # The attribute is "content-type". The previous name, "content-node_type",
    # came from an accidental rename and could never match an XML attribute.
    content_type = node.get("content-type") or node.get("kwd-group-type") or ""
    lang = get_normalized_attrib(node, "lang") or self.lang

    self.kwds.extend([{"type": content_type, "lang": lang, "value": kwd} for kwd in kwds])

1340

def parse_ref_list(self, node, **kwargs):
    """
    Parse a <ref-list> node.

    Each <ref> child creates a JatsRef: the object is appended to
    self.bibitems, its citation HTML to self.bibitem and its warnings to
    self.warnings. <p> children are parsed only for their side effects.
    Other children are reported in self.warnings.

    :param node: the ref-list XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "ref":
            jats_ref = JatsRef(tree=child, lang=self.lang)
            self.warnings.extend(jats_ref.warnings)
            self.bibitems.append(jats_ref)
            self.bibitem.append(jats_ref.citation_html)
            continue

        if tag == "p":
            # Elsevier can store supplementary-material inside ref-list / p
            self.parse_node_with_mixed_content(child)
            continue

        self.warnings.append(
            {
                self.pid: self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            }
        )

1363

def parse_related_article(self, node, **kwargs):
    """
    Parse a <related-article> node and append a relation object
    (rel_type, id_value) to self.relations.

    If the instance has a pii attribute and the node's text is neither a
    DOI (it does not contain "10.") nor "NONE", the text is resolved with
    scrapping.fetch_article and the result is stored as the id value.

    :param node: the related-article XML node
    :param kwargs: unused
    """
    relation = Foo()
    relation.rel_type = get_normalized_attrib(node, "related-article-type") or ""

    target_id = node.text
    if hasattr(self, "pii") and target_id and "10." not in target_id and target_id != "NONE":
        # A pii is used instead of a DOI: call Elsevier to get the doi
        target_id = scrapping.fetch_article(self.doi, target_id, pii_doi_equivalence=True)
    relation.id_value = target_id

    self.relations.append(relation)

1379

def parse_related_object(self, node, **kwargs):
    """
    Parse a <related-object> node.

    Without @document-id-type, the node's data (rel, mimetype, location,
    base, metadata) are appended to self.related_objects.
    With @document-id-type, a "refers to" relation to @document-id is
    appended to self.relations, unless the id is "NONE". An id that is not
    a DOI (it does not contain "10.") is first resolved with
    scrapping.fetch_article.

    :param node: the related-object XML node
    :param kwargs: unused
    """
    data = {
        "rel": node.get("link-type") or "",
        "mimetype": node.get("content-type") or "",
        "location": get_normalized_attrib(node, "href") or "",
        "base": get_normalized_attrib(node, "base") or "",
        "metadata": get_xml_from_node(node),
    }

    if not (node.get("document-id-type") or ""):
        self.related_objects.append(data)
        return

    target_id = node.get("document-id") or ""
    if target_id == "NONE":
        return

    if target_id and "10." not in target_id:
        # A pii is used instead of a DOI: call Elsevier to get the doi
        target_id = scrapping.fetch_article(self.doi, target_id, pii_doi_equivalence=True)

    relation = Foo()
    relation.rel_type = "refers to"
    relation.id_value = target_id

    self.relations.append(relation)

1412

def parse_sec(self, node, **kwargs):
    """
    Parse a <sec> node: only its <ref-list> children are processed.

    <title> children are silently ignored; any other child is reported in
    self.warnings.

    :param node: the sec XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "title":
            continue

        if tag == "ref-list":
            self.parse_ref_list(child)
            continue

        self.warnings.append(
            {
                self.pid: self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            }
        )

1431

def parse_self_uri(self, node, **kwargs):
    """
    Parse a <self-uri> node and append a "full-text" stream to self.streams.

    The mimetype is @content-type ("text/html" if missing). Nodes with the
    "application/xml" mimetype are not appended.

    :param node: the self-uri XML node
    :param kwargs: unused
    """
    mimetype = node.get("content-type") or "text/html"
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # The XML of the Elsevier archive does not declare the PDF location
    # like the other Mathdoc collections.
    # The collection folder is missing: add it back
    if hasattr(self, "pii") and hasattr(self, "issue"):
        collection_dir = self.issue.journal.pid
        if os.path.dirname(location) != collection_dir:
            location = os.path.join(collection_dir, self.issue.pid, location)

    if self.no_bib:
        location = "http://www.numdam.org/item/" + os.path.basename(location)

    text = "" if node.text is None else normalize_space(node.text)
    stream = {
        "rel": "full-text",
        "mimetype": mimetype,
        "location": location,
        "base": base,
        "text": text,
    }

    # Ext-links, Related-objects used metadata instead of text. Strange difference ?
    # xml_cmds ignores "application/xml" in add_objects_with_location:
    # these streams are ignored here.
    if mimetype == "application/xml":
        return

    self.streams.append(stream)

1459

def parse_sub_article(self, node, **kwargs):
    """
    Parse a <sub-article> node (used for translations): a JatsArticle is
    created from the node and appended to self.translations.

    :param node: the sub-article XML node
    :param kwargs: unused
    """
    self.translations.append(JatsArticle(tree=node))

1464

def parse_subj_group(self, node, **kwargs):
    """
    Parse a <subj-group> node and append a {"type", "lang", "value"} dict
    to self.subjs for each <subject> child.

    The type is the group's @subj-group-type and the language defaults to
    self.lang. Other children are reported in self.warnings.

    :param node: the subj-group XML node
    :param kwargs: unused
    """
    lang = get_normalized_attrib(node, "lang") or self.lang
    group_type = node.get("subj-group-type") or ""

    for child in node:
        tag = normalize(child.tag)

        if tag != "subject":
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )
            continue

        subject = {"type": group_type, "lang": lang, "value": get_text_from_node(child)}
        self.subjs.append(subject)

1486

def parse_supplementary_material(self, node, **kwargs):
    """
    Parse a <supplementary-material> node and append its data to
    self.supplementary_materials, unless a material with the same file
    name (basename of the location) is already in the list.

    The location is @href, or @id if there is no href. The mimetype is
    @mimetype, or is deduced from the location by resolver.get_mimetype.

    :param node: the supplementary-material XML node
    :param kwargs: unused
    """
    caption_html = ""
    for child in node:
        if child.tag == "caption":
            _, caption_html = self.parse_node_with_mixed_content(child)

    location = get_normalized_attrib(node, "href") or get_normalized_attrib(node, "id") or ""
    mimetype = node.attrib.get("mimetype") or resolver.get_mimetype(location)

    material = {
        "rel": node.attrib.get("content-type") or "supplementary-material",
        "mimetype": mimetype,
        "location": location,
        "base": "",
        "metadata": "",
        "caption": caption_html or "",
    }

    file_name = os.path.basename(location)
    already_known = any(
        os.path.basename(known["location"]) == file_name
        for known in self.supplementary_materials
    )
    if not already_known:
        self.supplementary_materials.append(material)

1517

def parse_title(self, node, **kwargs):
    """
    Parse a title node and set self.title_tex and self.title_html.
    The xrefs found in the title are ignored.

    self.title_xml is not set here: in xmldata.py, title_xml had the
    <title_group> tag.

    :param node: the title XML node
    :param kwargs: unused
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node, ignore_xref=True)
    self.title_tex = title_tex
    self.title_html = title_html

1524

def parse_title_group(self, node, **kwargs):
    """
    Parse a <title-group> node.

    - title / journal-title / article-title / book-title / issue-title:
      handled by parse_title
    - subtitle: appended (after a space) to self.title_tex/self.title_html
    - trans-title-group: handled by parse_trans_title_group
    - abbrev-title: its HTML is stored in self.abbrev
    - fn-group: the XML/HTML of its <fn> are appended to
      self.footnotes_xml/self.footnotes_html
    - Other children are reported in self.warnings.

    self.title_xml is the XML of the node, without its fn-group children.

    :param node: the title-group XML node
    :param kwargs: unused
    """
    title_tags = ("title", "journal-title", "article-title", "book-title", "issue-title")
    has_fn_group = False

    for child in node:
        tag = normalize(child.tag)

        if tag in title_tags:
            self.parse_title(child)
        elif tag == "subtitle":
            subtitle_tex, subtitle_html = self.parse_node_with_mixed_content(child)
            self.title_tex += " " + subtitle_tex
            self.title_html += " " + subtitle_html
        elif tag == "trans-title-group":
            self.parse_trans_title_group(child)
        elif tag == "abbrev-title":
            _, self.abbrev = self.parse_node_with_mixed_content(child)
        elif tag == "fn-group":
            has_fn_group = True
            for fn_node in child:
                if fn_node.tag != "fn":
                    continue
                _, fn_html = self.parse_node_with_fn(fn_node, keep_fn=True, keep_fn_label=False)
                fn_xml = get_xml_from_node(fn_node)
                self.footnotes_xml += fn_xml
                self.footnotes_html += fn_html
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + tag
                }
            )

    xml_source = node
    if has_fn_group:
        # fn-group is now a funding statement and will be exported separately in the XML:
        # => remove it from the title-group
        xml_source = etree.Element("title-group")
        for child in node:
            if normalize(child.tag) != "fn-group":
                xml_source.append(copy.deepcopy(child))

    self.title_xml = get_xml_from_node(xml_source)

1573

def parse_trans_abstract(self, node, **kwargs):
    """
    Parse a <trans-abstract> node and append its data to self.abstracts.

    The tag stored is the value of @abstract-type ("abstract" if missing;
    "author" is also stored as "abstract"). The language is "und" when the
    node has no lang attribute.

    :param node: the trans-abstract XML node
    :param kwargs: unused
    """
    abstract_type = get_normalized_attrib(node, "abstract-type") or "abstract"
    if abstract_type == "author":
        abstract_type = "abstract"

    lang = get_normalized_attrib(node, "lang") or "und"
    value_tex, value_html = self.parse_node_with_mixed_content(node)
    value_xml = get_xml_from_node(node)

    abstract = {
        "tag": abstract_type,
        "lang": lang,
        "value_xml": value_xml,
        "value_html": value_html,
        "value_tex": value_tex,
    }
    self.abstracts.append(abstract)

1590

def parse_trans_title(self, node, **kwargs):
    """
    Parse a <trans-title> node and set self.trans_title_tex,
    self.trans_title_html and self.trans_title_xml.

    :param node: the trans-title XML node
    :param kwargs: unused
    """
    title_tex, title_html = self.parse_node_with_mixed_content(node)
    self.trans_title_tex = title_tex
    self.trans_title_html = title_html
    self.trans_title_xml = get_xml_from_node(node)

1594

def parse_trans_title_group(self, node, **kwargs):
    """
    Parse a <trans-title-group> node: its <trans-title> children are
    handled by parse_trans_title, the others are reported in self.warnings.

    self.trans_lang is set to the node's lang ("und" if missing).

    :param node: the trans-title-group XML node
    :param kwargs: unused
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "trans-title":
            self.parse_trans_title(child)
            continue

        self.warnings.append(
            {
                self.pid: self.__class__.__name__
                + "."
                + inspect.currentframe().f_code.co_name
                + " "
                + tag
            }
        )

    self.trans_lang = get_normalized_attrib(node, "lang") or "und"

1613

def get_data_from_contrib(self, node):
    """
    Return the data of the person described by a <contrib> node.

    <contrib> creates 1 person, defined in <name>, <string-name> or
    <name-alternatives>. In a <mixed-citation>, each <name> creates
    1 person: the same code can't be used.

    :param node: the contrib XML node
    :return: the contributor dict created by create_contributor, filled
             with the data found in the node
    """
    params = create_contributor()

    for child in node:
        tag = child.tag

        if tag in ("name", "string-name"):
            self.update_data_from_name(child, params)
            # A string-name without structured name parts: keep its raw text
            if tag == "string-name" and params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = child.text or ""
        elif tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(child)
        elif tag == "contrib-id":
            id_type = child.get("contrib-id-type") or ""
            if id_type in ("orcid", "idref"):
                params[id_type] = child.text or ""
        elif tag == "address":
            params["addresses"].append(get_text_from_node(child))
        elif tag == "email":
            params["email"] = child.text or ""
        elif tag == "xref":
            # Elsevier uses xref/aff-alternatives to store affiliations
            if (child.get("ref-type") or "") == "aff":
                xref = child.get("rid") or ""
                if xref == "":
                    xref = get_text_from_node(child)
                if xref != "":
                    params.setdefault("xrefs", []).append(xref)
        elif tag == "collab":
            params["string_name"] = child.text or ""
        elif tag == "role":
            # Role is used in BJHTUP11 as a textual description of the role (ex "Présidente").
            # The node value can not be assigned to params['role'] as we want a controlled
            # vocabulary (author / editor / organizer...): the value is ignored
            pass
        else:
            self.warnings.append(
                {
                    self.pid: self.__class__.__name__
                    + "."
                    + inspect.currentframe().f_code.co_name
                    + " "
                    + child.tag
                }
            )

    # The addresses are not sorted: a sort causes differences between
    # the HTML and the PDF (discovered in PCJ).
    # The sort was introduced on 22/09/2020, based on differences between
    # the Cedrics->JATS XSLT and the Cedrics import

    helper_update_name_params(params)

    if node.get("corresp") == "yes":
        params["corresponding"] = True

    params["deceased_before_publication"] = node.get("deceased") == "yes"
    params["equal_contrib"] = node.get("equal-contrib") == "yes"

    return params

1693

def get_data_from_custom_meta(self, node):
    """Return the ``(name, value)`` pair stored in a <custom-meta> node.

    The text of the <meta-name> child becomes the name and the text of the
    <meta-value> child becomes the value; both default to "" when the child
    is absent. Any other child tag is reported in ``self.warnings``.
    """
    found = {"meta-name": "", "meta-value": ""}
    method_name = inspect.currentframe().f_code.co_name

    for entry in node:
        entry_tag = normalize(entry.tag)

        if entry_tag in found:
            # If the same tag appears more than once, the last one wins.
            found[entry_tag] = entry.text
            continue

        self.warnings.append(
            {self.pid: self.__class__.__name__ + "." + method_name + " " + entry_tag}
        )

    return found["meta-name"], found["meta-value"]

1717

def get_data_from_date(self, node, ignore_month=False):
    """Return the date held by a date-like node as a string.

    The ``iso-8601-date`` attribute is returned verbatim when present.
    Otherwise the date is assembled as ``year[-month][-day]`` from the
    <year>, <month> and <day> children. With ``ignore_month=True`` a <month>
    child is not used (it falls through to the unknown-tag warning branch).
    Unknown child tags are reported in ``self.warnings``.
    """
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    method_name = inspect.currentframe().f_code.co_name
    year = month = day = ""

    for part in node:
        part_tag = normalize(part.tag)

        if part_tag == "year":
            year = part.text
        elif part_tag == "month" and not ignore_month:
            month = part.text
        elif part_tag == "day":
            day = part.text
        else:
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + part_tag}
            )

    # Month and day are appended only when there is already something to append to.
    date_str = year
    for extra in (month, day):
        if date_str and extra:
            date_str = date_str + "-" + extra

    return date_str

1751

def get_data_from_ext_link(self, node, **kwargs):
    """Build the link dictionary describing an <ext-link> node.

    Returns a dict with the keys ``rel`` (the ``ext-link-type`` attribute),
    ``mimetype`` (always ""), ``location`` (normalized ``href`` attribute),
    ``base`` (normalized ``base`` attribute) and ``metadata`` (second value
    returned by ``parse_inner_node``). Missing attributes become "".
    """
    rel = node.get("ext-link-type") or ""
    location = get_normalized_attrib(node, "href") or ""
    base = get_normalized_attrib(node, "base") or ""

    # Force add_HTML_link off for the inner parse, whatever the caller passed.
    inner_kwargs = dict(kwargs, add_HTML_link=False)
    _ignored, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": rel,
        "mimetype": "",
        "location": location,
        "base": base,
        "metadata": metadata,
    }

1769

def get_data_from_history(self, node):
    """Collect the typed dates found under a <history> node.

    Returns a list of ``{"type": <date-type attribute>, "date": <string>}``
    dicts, one per child carrying a ``date-type`` attribute, in document
    order. Children without that attribute are reported in ``self.warnings``.
    """
    # TODO: transform history_dates in a hash where date-type is the key
    #       => Change database_cmds
    method_name = inspect.currentframe().f_code.co_name
    collected = []

    for entry in node:
        if "date-type" not in entry.attrib:
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + entry.tag}
            )
            continue

        kind = entry.attrib["date-type"]
        collected.append({"type": kind, "date": self.get_data_from_date(entry)})

    return collected

1791

def update_data_from_name(self, node, contributor):
    """Copy the parts of a <name> node into the ``contributor`` dict.

    <given-names>, <surname>, <prefix> and <suffix> children fill the
    ``first_name``, ``last_name``, ``prefix`` and ``suffix`` keys. Children
    with no text are skipped entirely; any other child tag (with text) is
    reported in ``self.warnings``. ``contributor`` is modified in place.
    """
    key_by_tag = {
        "given-names": "first_name",
        "surname": "last_name",
        "prefix": "prefix",
        "suffix": "suffix",
    }
    method_name = inspect.currentframe().f_code.co_name

    for part in node:
        if part.text is None:
            continue

        key = key_by_tag.get(part.tag)
        if key is None:
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + part.tag}
            )
        else:
            contributor[key] = part.text

1813

def get_data_from_name_alternatives(self, node):
    """Return the index form of a name stored under <name-alternatives>.

    The result is the text of the last <string-name specific-use="index">
    child, or "" when there is none. Children without text are skipped;
    children with text and any other tag are reported in ``self.warnings``.
    """
    method_name = inspect.currentframe().f_code.co_name
    index_name = ""

    for alternative in node:
        if alternative.text is None:
            continue

        if alternative.tag != "string-name":
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + alternative.tag}
            )
            continue

        if alternative.get("specific-use") == "index":
            index_name = alternative.text

    return index_name

1834

def get_data_from_uri(self, node, **kwargs):
    """Build the link dictionary describing a <uri> node.

    Same shape as the dict produced for <ext-link>, except that ``rel`` is
    None and ``base`` is always "". ``location`` is the normalized ``href``
    attribute ("" when missing) and ``metadata`` is the second value
    returned by ``parse_inner_node``.
    """
    location = get_normalized_attrib(node, "href") or ""

    # Force add_HTML_link off for the inner parse, whatever the caller passed.
    inner_kwargs = dict(kwargs, add_HTML_link=False)
    _ignored, metadata = self.parse_inner_node(node, **inner_kwargs)

    return {
        "rel": None,
        "mimetype": "",
        "location": location,
        "base": "",
        "metadata": metadata,
    }

1844

def helper_add_link_from_node(self, node, **kwargs):
    """Return the text to emit for a link-like node.

    The node is handed to the ``get_data_from_<tag>`` method matching its
    normalized tag. When the resulting ``rel`` is empty or "uri", the link
    is rendered: as a ``\\href{...}{...}`` command if ``self.for_tex_file``
    is set, otherwise through ``make_links_clickable``. For any other
    ``rel`` the node's own text ("" when absent) is returned.
    """
    fallback_text = node.text or ""

    extractor_name = "get_data_from_" + normalize(node.tag).replace("-", "_")
    link = getattr(self, extractor_name)(node, **kwargs)

    if link["rel"] and link["rel"] != "uri":
        return fallback_text

    target = link["location"]
    if self.for_tex_file:
        return "".join(("\\href{", target, "}{", link["metadata"], "}"))
    return make_links_clickable(target, link["metadata"])

1858

def get_list_start_value(self, list_node):
    """Return the number of items that precede ``list_node``'s first item.

    A list without a ``continued-from`` attribute starts at 0. Otherwise the
    referenced list is looked up by ``id`` in ``self.tree`` and the result is
    that list's number of children plus its own start value (computed
    recursively, so chains of continued lists add up).

    A ``continued-from`` that points to an id missing from the tree is
    treated as "not continued" and yields 0. The previous implementation
    left ``start`` unassigned in that case and raised UnboundLocalError.
    """
    continued_from = list_node.get("continued-from")
    if continued_from is None:
        return 0

    from_node = self.tree.find(f'.//*[@id="{continued_from}"]')
    if from_node is None:
        # Dangling reference: fall back to a fresh numbering.
        return 0

    return len(from_node) + self.get_list_start_value(from_node)

1869

1870

class MathdocPublication(MathdocPublicationData, JatsBase):
    """Publication (collection) metadata parsed from a Mathdoc XML tree.

    The tree passed as the ``tree`` keyword argument is parsed in the
    constructor; the parse methods set instance attributes (pid, issn,
    e_issn, ids, ext_links, coltype, wall, provider, abstracts...).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read every direct child of ``tree`` and dispatch on its tag.

        Unknown tags are reported in ``self.warnings``.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag in ("publication-id", "collection-id"):
                node_type = node.get("publication-id-type")
                # An id without a type is accepted as the pid.
                if node_type is None or node_type in ["numdam-id", "mathdoc-id"]:
                    self.pid = node.text
            elif tag == "title-group":
                self.parse_title_group(node)
            elif tag == "issn":
                node_type = node.get("pub-type")
                if node_type == "ppub":
                    self.issn = node.text
                    self.ids.append(("issn", node.text))
                elif node_type == "epub":
                    self.e_issn = node.text
                    self.ids.append(("e-issn", node.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(node)
                self.ext_links.append(data)
            elif tag == "custom-meta-group":
                self.parse_custom_meta_group(node)
            elif tag == "description":
                self.parse_description(node)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the <custom-meta> children: serial-type, wall and provider.

        ``wall`` is converted with ``int()``; other custom-meta names are
        ignored. Children that are not <custom-meta> are reported in
        ``self.warnings``.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "serial-type":
                    self.coltype = value
                elif name == "wall":
                    self.wall = int(value)
                elif name == "provider":
                    self.provider = value
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_description(self, node, **kwargs):
        """Store a <description> node as an abstract with tag "description".

        ``value_xml`` keeps the full serialized node; ``value_html`` and
        ``value_tex`` hold the same content without the enclosing
        <description ...> / </description> tags.
        """
        # tag = get_normalized_attrib(node, "abstract-node_type") or "abstract"
        tag = "description"
        lang = get_normalized_attrib(node, "lang") or self.lang
        value_xml = get_xml_from_node(node)

        # Strip the whole opening tag, attributes included. The previous code
        # replaced the misspelled "<decription", so the opening tag was never
        # removed from the HTML/TeX values.
        inner = re.sub(r"<description\b[^>]*>", "", value_xml, count=1)
        value_tex = value_html = inner.replace("</description>", "")

        self.abstracts.append(
            {
                "tag": tag,
                "lang": lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

1953

1954

class JatsPublisher(PublisherData):
    """Publisher name and location parsed from a JATS <publisher> node.

    The node passed as the ``tree`` keyword argument is parsed in the
    constructor. Unknown child tags are collected in ``self.warnings``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.warnings = []
        # The warnings gathered by parse_tree are kept: the previous code
        # reset self.warnings to [] right after parsing, which discarded them.
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Set ``name`` / ``loc`` from <publisher-name> / <publisher-loc>."""
        for node in tree:
            tag = normalize(node.tag)

            if tag == "publisher-name":
                self.name = node.text
            elif tag == "publisher-loc":
                self.loc = node.text
            else:
                # NOTE(review): relies on PublisherData providing ``pid`` -- confirm.
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

1980

1981

class JatsJournal(JournalData, JatsBase):
    """Journal metadata parsed from a JATS <journal-meta> node.

    The node passed as the ``tree`` keyword argument is parsed in the
    constructor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Read the journal id, titles, publisher and ISSNs.

        A <journal-id> without a type is treated as a "numdam-id"; an <issn>
        without a ``pub-type`` is treated as a print ISSN ("ppub"). Unknown
        tags are reported in ``self.warnings``.
        """
        super().parse_tree(tree)

        method_name = inspect.currentframe().f_code.co_name
        # pub-type -> (attribute to set, kind recorded in self.ids)
        issn_targets = {"ppub": ("issn", "issn"), "epub": ("e_issn", "e-issn")}

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-id":
                if (node.get("journal-id-type") or "numdam-id") in ("numdam-id", "mathdoc-id"):
                    self.pid = node.text
            elif tag == "journal-title-group":
                self.parse_title_group(node)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=node)
            elif tag == "issn":
                target = issn_targets.get(node.get("pub-type") or "ppub")
                if target is not None:
                    attribute, id_kind = target
                    setattr(self, attribute, node.text)
                    self.ids.append((id_kind, node.text))
            else:
                self.warnings.append(
                    {self.pid: self.__class__.__name__ + "." + method_name + " " + tag}
                )

2019

2020

class JatsIssue(IssueData, JatsBase):
    """Issue parsed from a JATS issue tree: journal, issue metadata, articles.

    The tree passed as the ``tree`` keyword argument is parsed in the
    constructor. Optional keyword arguments: ``from_folder`` and ``no_bib``
    (both forwarded to every JatsArticle that is created).
    """

    # Ordered rules mapping the beginning of a (lower-cased) heading to an
    # article type: (prefixes, article type, required old types or None).
    # The first matching rule wins, so the order matters.
    _HEADING_TYPE_RULES = (
        (("erratum",), "erratum", None),
        (("corrigendum",), "corrigendum", None),
        (("foreword",), "foreword", None),
        (("nécrologie", "obituary"), "history-of-sciences", None),
        (("block calendar/éphéméride", "chronique"), "history-of-sciences", None),
        (("histoire", "historic"), "history-of-sciences", None),
        (("tribute/hommage",), "history-of-sciences", None),
        (("note historique",), "historical-commentary", None),
        (("le point sur", "le point-sur"), "review", None),
        (("review", "revue", "concise review"), "review", None),
        (("conférence",), "congress", None),
        (("communication", "preliminary"), "preliminary-communication", None),
        (("perspective",), "opinion", ("correspondence", "short communication")),
        (("debate",), "opinion", None),
        (("index", "keyword", "sommaire"), "editorial", None),
        (("table auteurs", "table sommaire"), "editorial", None),
        (("page présentation des index",), "editorial", None),
    )

    # Headings that are redundant with article types: they are not kept as subjects.
    _REDUNDANT_HEADINGS = (
        "editorial",
        "éditorial",
        "avant-propos",
        "book review",
        "comment",
        "concise review paper",
        "answer",
        "commentaire",
        "commentary",
        "reply",
        "foreword",
        "full paper",
        "mémoire",
    )

    # Exact values of a "type" subject and the article type they map to.
    _TYPE_SUBJECT_MAP = {
        "discussion": "letter",
        "mini review": "review",
        "review article": "review",
        "book review": "review",
        "research article": "research-article",
        "short communication": "foreword",
        "correspondence": "letter",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # from_folder is used to change the location of Elsevier graphics to a full path location
        self.from_folder = kwargs.get("from_folder")
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """Parse the issue tree, then post-process the parsed articles.

        After parsing: the publisher is copied from the journal, editors that
        merely replicate the issue editors are removed from the articles and,
        when at least one article has a ``pii`` attribute (treated here as an
        Elsevier import), icon locations, article types and subjects are fixed.
        """
        super().parse_tree(tree)

        for node in tree:
            tag = normalize(node.tag)

            if tag == "journal-meta":
                self.journal = JatsJournal(tree=node)
            elif tag == "issue-meta":
                self.parse_issue_meta(node)
            elif tag == "body":
                for child in node:
                    tag = normalize(child.tag)

                    if tag == "article":
                        article = JatsArticle(
                            tree=child,
                            issue=self,
                            from_folder=self.from_folder,
                            no_bib=self.no_bib,
                        )
                        self.warnings.extend(article.warnings)
                        self.articles.append(article)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        if self.journal is not None:
            self.publisher = self.journal.publisher

        # Issue editors may be replicated in all the articles, remove them
        issue_editors = [contrib for contrib in self.contributors if contrib["role"] == "editor"]

        is_elsevier = False
        for xarticle in self.articles:
            if hasattr(xarticle, "pii"):
                is_elsevier = True
            self._remove_replicated_editors(xarticle, issue_editors)

        if is_elsevier:
            self._fix_icon_locations()
            for xarticle in self.articles:
                self._fix_article_type_and_subjects(xarticle)

    def _remove_replicated_editors(self, xarticle, issue_editors):
        """Drop the editors of ``xarticle`` when they equal the issue editors.

        Two editor lists are equal when they have the same length and the
        same last/first names at each position.
        """
        editors = [contrib for contrib in xarticle.contributors if contrib["role"] == "editor"]
        is_equal = len(editors) == len(issue_editors) and all(
            editor["last_name"] == issue_editor["last_name"]
            and editor["first_name"] == issue_editor["first_name"]
            for editor, issue_editor in zip(editors, issue_editors)
        )
        if is_equal:
            xarticle.contributors = [
                contrib for contrib in xarticle.contributors if contrib["role"] != "editor"
            ]

    def _fix_icon_locations(self):
        """Turn the location of icon links into ``file:`` locations."""
        for link in self.ext_links:
            if link["rel"] in ["icon", "small_icon"]:
                base_dir = self.journal.pid
                location = link["location"]
                if os.path.dirname(location) != base_dir:
                    location = os.path.join(base_dir, self.pid, location)
                if self.from_folder:
                    location = os.path.join(self.from_folder, location)
                    location = "file:" + location
                link["location"] = location

    def _article_type_from_type_subject(self, value, xarticle, current_type):
        """Return the article type for a "type" subject (``value`` is lower-cased).

        ``current_type`` is returned unchanged when the value is not recognized.
        """
        if value == "editorial":
            if xarticle.title_tex.lower().startswith("foreword"):
                return "foreword"
            return "editorial"
        if value in self._TYPE_SUBJECT_MAP:
            return self._TYPE_SUBJECT_MAP[value]
        if value.startswith("conference"):
            return "congress"
        return current_type

    def _article_type_from_heading(self, value, old_type):
        """Return the article type announced by a heading, or None.

        ``value`` is the lower-cased, stripped heading; ``old_type`` is the
        last "type" subject seen for the article ("" if none).
        """
        for prefixes, article_type, required_old_types in self._HEADING_TYPE_RULES:
            if value.startswith(prefixes) and (
                required_old_types is None or old_type in required_old_types
            ):
                return article_type
        return None

    def _strip_dossier_prefix(self):
        """Remove a leading "Dossier: " (any case) from the issue title."""
        if self.title_tex.lower().startswith("dossier: "):
            self.title_tex = self.title_html = self.title_tex[9:]
            # The XML title is rebuilt from the stripped title in its original
            # case. The previous code used the lower-cased copy, which made
            # title_xml disagree with title_tex/title_html.
            self.title_xml = (
                "<issue-title>" + get_single_title_xml(self.title_tex) + "</issue-title>"
            )

    def _fix_article_type_and_subjects(self, xarticle):
        """Set ``xarticle.atype`` and filter ``xarticle.subjs``.

        "type" subjects and some headings are converted into an article type;
        headings redundant with the article type or with the issue title are
        dropped from the subjects.
        """
        article_type = "research-article"
        old_type = ""
        new_subjs = []

        # An empty or missing fpage says nothing about the article type.
        # (A None fpage used to raise an uncaught TypeError in int().)
        if xarticle.fpage:
            try:
                int(xarticle.fpage)
            except ValueError:
                # fpage is not a number: the article is an editorial
                article_type = "editorial"

        if article_type == "research-article":
            for subj in xarticle.subjs:
                if subj["type"] == "type":
                    # Fix article types
                    old_type = subj["value"].lower()
                    article_type = self._article_type_from_type_subject(
                        old_type, xarticle, article_type
                    )
                elif subj["type"] == "heading" and not xarticle.title_tex:
                    # The title may be stored in the heading: fix it
                    xarticle.title_tex = xarticle.title_html = subj["value"]
                    xarticle.title_xml = get_title_xml(subj["value"])
                elif subj["type"] == "heading":
                    value = subj["value"].lower().strip()
                    self._strip_dossier_prefix()

                    # Some heading values are in fact article type
                    heading_type = self._article_type_from_heading(value, old_type)
                    if heading_type is not None:
                        article_type = heading_type
                    elif value.startswith("fac-similé"):
                        # crbiol article; Pubmed classifies these as "Classical Article"
                        article_type = "historical-commentary"
                        # Keep the subject in this case to preserve the "fac-similé" (== copy) mention
                        new_subjs.append(subj)
                    # Ignore the issue titles
                    elif not self.title_tex or not value.startswith(
                        self.title_tex.lower().strip()
                    ):
                        # Exclude headings that are redundant with article types
                        if not value.startswith(self._REDUNDANT_HEADINGS):
                            new_subjs.append(subj)
                else:
                    new_subjs.append(subj)

        xarticle.atype = article_type
        xarticle.subjs = new_subjs

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the <custom-meta> children: provider and efirst.

        ``efirst`` sets ``with_online_first`` to True when its value is "yes".
        Children that are not <custom-meta> are reported in ``self.warnings``.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "custom-meta":
                name, value = self.get_data_from_custom_meta(child)

                if name == "provider":
                    self.provider = value
                elif name == "efirst":
                    self.with_online_first = value == "yes"
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_issue_meta(self, node, **kwargs):
        """Read the children of <issue-meta>.

        Tags without an explicit branch are dispatched to a ``parse_<tag>``
        method when one exists (called with ``add_ext_link=True``); otherwise
        they are reported in ``self.warnings``. If no "last-modified" history
        date was found, the last-modified date is set to the current time.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "issue-id":
                self.parse_id(child)
            elif tag == "volume-series":
                self.vseries = child.text
            elif tag == "volume":
                self.volume = child.text
            elif tag == "issue":
                self.number = child.text
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child, ignore_month=True)
            elif tag == "history":
                history_dates = self.get_data_from_history(child)
                for date in history_dates:
                    if date["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = date["date"]
                    elif date["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = date["date"]
            elif tag == "issue-title":
                content_type = child.get("content-type") or ""
                if content_type != "subtitle" and content_type != "cover-date":
                    # Elsevier stores contributors in subtitles. Ignore.
                    lang = get_normalized_attrib(child, "lang") or "und"
                    if not self.title_tex and (
                        self.lang == "und" or lang == "und" or lang == self.lang
                    ):
                        self.parse_title(child)
                        # In xmldata, title_xml had the <title_group> tag:
                        # self.title_xml can't be set in parse_title
                        self.title_xml += get_xml_from_node(child)
                    else:
                        self.trans_lang = lang
                        (
                            self.trans_title_tex,
                            self.trans_title_html,
                        ) = self.parse_node_with_mixed_content(child)
                        self.title_xml += get_xml_from_node(child)
            elif tag == "issue-title-group":
                self.parse_title_group(child)
            else:
                fct_name = "parse_" + tag.replace("-", "_")
                ftor = getattr(self, fct_name, None)
                if callable(ftor):
                    ftor(child, add_ext_link=True)
                else:
                    self.warnings.append(
                        {
                            self.pid: self.__class__.__name__
                            + "."
                            + inspect.currentframe().f_code.co_name
                            + " "
                            + tag
                        }
                    )

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

2338

2339

class JatsArticleBase(JatsBase):
    """Base class holding the custom-meta parsing shared by article-like objects."""

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the <custom-meta> children of ``node``.

        "article-number" and "talk-number" set the attribute of the same
        name (with an underscore); "presented" adds a contributor with the
        "presenter" role whose name is the value without its
        "Presented by " / "Présenté par " prefix. Other names are ignored.
        Children that are not <custom-meta> are reported in ``self.warnings``.
        """
        method_name = inspect.currentframe().f_code.co_name
        attribute_by_name = {
            "article-number": "article_number",
            "talk-number": "talk_number",
        }

        for entry in node:
            entry_tag = normalize(entry.tag)

            if entry_tag != "custom-meta":
                self.warnings.append(
                    {self.pid: self.__class__.__name__ + "." + method_name + " " + entry_tag}
                )
                continue

            name, value = self.get_data_from_custom_meta(entry)

            if name in attribute_by_name:
                setattr(self, attribute_by_name[name], value)
            elif name == "presented":
                presenter = create_contributor()
                presenter["role"] = "presenter"
                without_english = value.replace("Presented by ", "")
                presenter["string_name"] = without_english.replace("Présenté par ", "")
                presenter["contrib_xml"] = get_contrib_xml(presenter)
                self.contributors.append(presenter)

2370

2371

2372class JatsArticle(ArticleData, JatsArticleBase):

def __init__(self, *args, **kwargs):  # , tree, pid=None):
    """Store the parsing options, then parse the ``tree`` keyword argument.

    Optional keyword arguments and their defaults: ``pid`` (None), ``issue``
    (None), ``add_span_around_tex_formula`` (False), ``for_tex_file`` (False),
    ``from_folder`` (None) and ``no_bib`` (False). ``tree`` is mandatory.
    """
    super().__init__(*args, **kwargs)

    self.pid = kwargs.get("pid")
    self.issue = kwargs.get("issue")

    self.add_span_around_tex_formula = kwargs.get("add_span_around_tex_formula", False)
    self.for_tex_file = kwargs.get("for_tex_file", False)
    self.from_folder = kwargs.get("from_folder")
    self.no_bib = kwargs.get("no_bib", False)

    self.parse_tree(kwargs["tree"])

2388

def parse_tree(self, tree):
    """Parse an <article> tree in two passes, then finalize the body.

    First pass: <front>/<front-stub> metadata and <floats-group>. Second
    pass: <back>, <body> and <sub-article>. Afterwards the footnotes are
    appended to ``body_html``, the funding statement is wrapped in a
    <name-content> element if needed, ``floats_group_xml`` is appended to
    ``body_xml`` and, when ``no_bib`` is set, a NUMDAM "source" link is added.
    Unknown tags are reported in ``self.warnings``.
    """
    super().parse_tree(tree)

    method_name = inspect.currentframe().f_code.co_name
    self.atype = get_normalized_attrib(tree, "article-type") or ""

    # First loop to catch float-groups that are inserted inside the body
    for node in tree:
        tag = normalize(node.tag)

        if tag == "front":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "article-meta":
                    self.parse_article_meta(child)
                else:
                    self.warnings.append(
                        {self.pid: self.__class__.__name__ + "." + method_name + " " + child_tag}
                    )
        elif tag == "front-stub":
            self.parse_article_meta(node)
        elif tag == "floats-group":
            self.parse_floats_group(node)

    # Children of <back> that are simply forwarded to a parse method
    back_parsers = {
        "ack": "parse_ack",
        "sec": "parse_sec",
        "app-group": "parse_app_group",
        "fn-group": "parse_fn_group",
    }

    for node in tree:
        tag = normalize(node.tag)

        if tag == "back":
            for child in node:
                child_tag = normalize(child.tag)

                if child_tag == "ref-list" and not self.no_bib:
                    print("Parse bib")
                    self.parse_ref_list(child)
                elif child_tag in back_parsers:
                    getattr(self, back_parsers[child_tag])(child)
                else:
                    self.warnings.append(
                        {self.pid: self.__class__.__name__ + "." + method_name + " " + child_tag}
                    )
        elif tag == "body":
            self.parse_body(node)
        elif tag == "sub-article":
            self.parse_sub_article(node)
        elif tag in ("floats-group", "front"):
            # Handled above
            pass
        else:
            self.warnings.append(
                {self.pid: self.__class__.__name__ + "." + method_name + " " + tag}
            )

    # Add the footnotes at the end
    if len(self.fns) > 0:
        footnotes_html = '<div class="footnotes">' + "".join(self.fns) + "</div>"
        if self.body_html:
            self.body_html = self.body_html + footnotes_html
        else:
            self.body_html = footnotes_html

    funding = self.funding_statement_xml
    if len(funding) > 0 and '<name-content content-type="fn"' not in funding:
        self.funding_statement_xml = f'<name-content content-type="fn">{funding}</name-content>'

    # Case for XML with <body>, then <back> and <floats_group>
    # The figures/tables of the floats_group are added inside the body_html
    # (close to their first <xref>)
    # It's too complicated to do the same for the body_xml as we use the get_xml_from_node function.
    # Instead, we append the floats_group_xml to the body_xml
    if hasattr(self, "floats_group_xml"):
        self.body_xml += self.floats_group_xml

    # Special treatment for Elsevier articles: web scrapping to find the date_published
    # Moved to the import management commands since Elsevier blocks IP after 1000+ requests
    # if hasattr(self, 'pii') and self.date_published_iso_8601_date_str is None:
    #     article_data = scrapping.fetch_article(self.doi, self.pii)
    #     self.date_published_iso_8601_date_str = article_data.date_published_iso_8601_date_str

    if self.no_bib:
        # For Geodesic
        source_link = create_extlink()
        source_link["rel"] = "source"
        source_link["location"] = "http://www.numdam.org/item/" + self.pid
        source_link["metadata"] = "NUMDAM"
        self.ext_links.append(source_link)

2503

def update_body_content(self, node, **kwargs):
    """Append the content of ``node`` to body_tex, body_html and body_xml.

    Nothing is done for a node without children. With ``use_sec=True`` the
    node is parsed with ``parse_node_with_sec`` and wrapped in a <sec> when
    it is merged into an existing ``body_xml``. If parsing added
    supplementary materials, the content is only appended when some <p>
    child holds something other than a <supplementary-material>.
    """
    if len(node) == 0:
        # Most journals do not display the Full text
        # the <body> is then used to store the text for the search engine and has no children
        # Let's not compute body_html in this case.
        # We want the same behavior for journals that display the Full text,
        # but with old articles without Full text.
        return

    # <front> has to be put before <body> so self.pid is defined here
    if hasattr(settings, "SITE_URL_PREFIX"):
        kwargs["base_url"] = (
            "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
        )
    else:
        kwargs["base_url"] = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    supplements_before = len(self.supplementary_materials)

    # Hack for Elsevier: convert <ack> into <sec> of the <body>
    use_sec = bool(kwargs.get("use_sec"))
    if use_sec:
        body_tex, body_html = self.parse_node_with_sec(node, **kwargs)
    else:
        body_tex, body_html = self.parse_node_with_mixed_content(node, **kwargs)

    if len(self.supplementary_materials) != supplements_before:
        # Elsevier stores supplementary-material in app-group.
        # They are extracted, but ignored in the body_html if the appendix has only supplements
        has_other_content = any(
            grandchild.tag != "supplementary-material"
            for child in node
            if child.tag == "p"
            for grandchild in child
        )
        if not has_other_content:
            return

    if self.body_tex:
        self.body_tex = self.body_tex + body_tex
    else:
        self.body_tex = body_tex

    if self.body_html:
        self.body_html = self.body_html + body_html
    else:
        self.body_html = body_html

    node_xml = get_xml_from_node(node)
    if not self.body_xml:
        self.body_xml = node_xml
        return

    # Drop the last 7 characters of the current body_xml (its closing
    # </body>) before appending. In the use_sec case the first 5 and last 6
    # characters of the node XML are removed too (presumably <ack> and
    # </ack>) and the remainder is wrapped in <sec>.
    if use_sec:
        self.body_xml = f"{self.body_xml[0:-7]}<sec>{node_xml[5:-6]}</sec></body>"
    else:
        self.body_xml = f"{self.body_xml[0:-7]}{node_xml}</body>"

2554

def parse_ack(self, node, **kwargs):
    """Handle an <ack> node.

    A node whose ``content-type`` is "COI-statement" sets
    ``self.coi_statement`` to the node text. Any other <ack> is merged into
    the body through ``update_body_content(node, use_sec=True)``.
    """
    if (node.get("content-type") or "") != "COI-statement":
        # Hack for Elsevier: convert <ack> into <sec> of the <body>
        self.update_body_content(node, use_sec=True)
        return

    self.coi_statement = get_text_from_node(node)

2562

def parse_app(self, node, **kwargs):
    """
    Parse an <app> node.

    Each <sec> child is passed to update_body_content.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "sec":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        # Elsevier can store all appendixes inside one <app> ?!?
        # One of them can store the supplements and has to be ignored in the body_html
        self.update_body_content(child)

2581

def parse_app_group(self, node, **kwargs):
    """
    Parse an <app-group> node.

    Each <app> child is passed to parse_app.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "app":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        self.parse_app(child)

2598

def parse_article_categories(self, node, **kwargs):
    """
    Parse an <article-categories> node.

    Each <subj-group> child is passed to parse_subj_group.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "subj-group":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        self.parse_subj_group(child)

2615

def parse_article_meta(self, node, **kwargs):
    """
    Parse an <article-meta> node and set the matching instance variables.

    Ids, pages, size, elocation and dates are handled directly.
    Any other tag is dispatched to self.parse_<tag> (with add_ext_link=True) when such
    a callable exists; otherwise the tag is recorded in self.warnings.
    """
    # Tags that are read but deliberately not stored.
    # TODO: store permissions in XML
    # 2022/11/15 Mersenne meeting. ignore author-notes
    ignored_tags = (
        "volume",
        "issue-id",
        "permissions",
        "pub-date-not-available",
        "author-notes",
    )

    for child in node:
        tag = normalize(child.tag)

        if tag in ignored_tags:
            continue

        if tag == "article-id":
            self.parse_id(child)
        elif tag == "fpage":
            self.fpage = child.text
            self.page_type = child.get("content-type") or ""
        elif tag == "lpage":
            self.lpage = child.text or ""
        elif tag == "page-range":
            self.page_range = child.text
        elif tag in ("page-count", "size"):
            self.size = child.text
        elif tag == "elocation-id":
            self.elocation = child.text
        elif tag == "pub-date":
            # A pub-date without date-type is treated as the publication date;
            # any other date-type is stored as an "online" history date
            is_publication_date = (child.get("date-type") or "pub") == "pub"
            iso_date = self.get_data_from_date(child)
            if is_publication_date:
                self.date_published_iso_8601_date_str = iso_date
            else:
                self.history_dates.append({"type": "online", "date": iso_date})
        elif tag == "history":
            self.history_dates += self.get_data_from_history(child)
            for entry in self.history_dates:
                if entry["type"] == "prod-deployed-date":
                    self.prod_deployed_date_iso_8601_date_str = entry["date"]
        else:
            handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
            if callable(handler):
                handler(child, add_ext_link=True)
            else:
                where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: where + " " + tag})

2667

def parse_author_notes(self, node, **kwargs):
    """
    Parse an <author-notes> node.

    For each <fn> child, append its XML to self.footnotes_xml and the HTML returned
    by parse_node_with_fn to self.footnotes_html. Other children are skipped silently.
    """
    for child in node:
        if normalize(child.tag) != "fn":
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True, keep_fn_label=False)
        self.footnotes_xml += get_xml_from_node(child)
        self.footnotes_html += fn_html

2676

def parse_body(self, node, **kwargs):
    """
    Parse a <body> node.

    Stores the plain text in self.body, then lets update_body_content process the node.
    self.body_xml is set from the node only if it is still empty afterwards.
    """
    self.body = get_text_from_node(node)

    # Reset the pending floats only when a floats dictionary exists on the instance
    if hasattr(self, "floats"):
        self.floats_to_insert = []

    self.update_body_content(node, **kwargs)

    self.body_xml = self.body_xml or get_xml_from_node(node)

2687

def parse_boxed_text(self, node, **kwargs):
    """
    Parse a <boxed-text> node and store its HTML in self.floats, keyed by the node id.

    The node is always parsed with parse_node_with_boxed_text;
    the resulting HTML is only stored when the node has an "id" attribute.
    """
    _, box_html = self.parse_node_with_boxed_text(node, **kwargs)

    box_id = node.attrib.get("id")
    if box_id is not None:
        self.floats[box_id] = box_html

2699

def parse_floats_group(self, node, **kwargs):
    """
    Parse a <floats-group> node.

    Resets self.floats, parses each <fig>, <table-wrap> and <boxed-text> child with a
    base_url built from the settings and self.pid, and stores the XML of the whole node
    in self.floats_group_xml. Any other child tag is recorded in self.warnings.
    """
    if hasattr(settings, "SITE_URL_PREFIX"):
        base_url = "/" + settings.SITE_URL_PREFIX + settings.ARTICLE_BASE_URL + self.pid
    else:
        base_url = os.path.join(settings.ARTICLE_BASE_URL, self.pid)

    self.floats = {}
    for child in node:
        tag = normalize(child.tag)

        if tag == "boxed-text":
            self.parse_boxed_text(child, base_url=base_url)
        elif tag in ("fig", "table-wrap"):
            if tag == "fig":
                parse_float = self.parse_node_with_fig
            else:
                parse_float = self.parse_node_with_table_wrap
            parse_float(child, append_floats=True, base_url=base_url)
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

    self.floats_group_xml = get_xml_from_node(node)

2730

def parse_fn_group(self, node, **kwargs):
    """
    Parse a <fn-group> node.

    For each <fn> child, append the HTML returned by parse_node_with_fn to
    self.footnotes_html and the XML of the child to self.footnotes_xml.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag != "fn":
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})
            continue

        _, fn_html = self.parse_node_with_fn(child, keep_fn=True)
        fn_xml = get_xml_from_node(child)

        self.footnotes_html += fn_html
        self.footnotes_xml += fn_xml

2751

def parse_funding_group(self, node, **kwargs):
    """
    Parse a <funding-group> node.

    <award-group> children are passed to parse_award_group.
    For <funding-statement> children, the <fn> found inside <name-content> are converted
    to HTML and appended to self.funding_statement_html; the XML of the <name-content>
    node is stored in self.funding_statement_xml.
    Any other child tag is recorded in self.warnings.
    """
    for child in node:
        tag = normalize(child.tag)

        if tag == "award-group":
            self.parse_award_group(child)
        elif tag == "funding-statement":
            for funding_node in child:
                if funding_node.tag != "name-content":
                    continue

                for funding_child in funding_node:
                    if funding_child.tag == "fn":
                        _, fn_html = self.parse_node_with_fn(funding_child, keep_fn=True)
                        self.funding_statement_html += fn_html

                # NOTE(review): assumed to run once per <name-content>, after its children
                # are processed - the nesting was rebuilt from a flattened listing; confirm
                self.funding_statement_xml = get_xml_from_node(funding_node)

            # TODO: handle funding-statement with simple texts
        else:
            where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: where + " " + tag})

2778

def parse_issue(self, node, **kwargs):
    """
    Set self.seq from the "seq" attribute of an <issue> node.

    The attribute is ignored (seq is "0") when the instance has a pii attribute.
    A missing or empty attribute also gives "0".
    """
    # Elsevier stores bs in the seq attribute
    if hasattr(self, "pii"):
        self.seq = "0"
    else:
        self.seq = node.get("seq") or "0"

2782

2783

class JatsRef(RefBase, JatsBase):
    """
    A bibliographic reference parsed from a JATS <ref> tree.

    The tree is parsed in the constructor: the label, the citation
    (<mixed-citation>, <note> or <element-citation>) and the XML of each child
    are stored in instance variables.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base classes, then parse kwargs["tree"] (required keyword)."""
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of a <ref> node.

        Sets user_id, label, citation_tex and citation_html, and appends the XML of
        every child to citation_xml. Unknown children are recorded in self.warnings.
        """
        super().parse_tree(tree)

        self.user_id = get_normalized_attrib(tree, "id") or ""

        for node in tree:
            tag = normalize(node.tag)

            if tag == "label":
                self.label = node.text or ""

                # Labels are stored between brackets: "1" becomes "[1]"
                if self.label and self.label[0] != "[":
                    self.label = "[" + self.label + "]"

            elif tag in ("mixed-citation", "note"):
                self.parse_citation_node(node)

                self.citation_tex, self.citation_html = self.parse_node_with_mixed_content(
                    node,
                    is_citation=True,
                    is_mixed_citation=True,
                    add_ext_link=True,
                    ref_type="misc",
                )

                if self.label:
                    self.citation_html = self.label + " " + self.citation_html
                    self.citation_tex = self.label + " " + self.citation_tex

            elif tag == "element-citation":
                self.parse_citation_node(node)

                self.citation_tex = self.citation_html = get_citation_html(self)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

            # With xmldata, citation_xml does not have '<ref>', but only the text of the children
            self.citation_xml += get_xml_from_node(node)

    def get_data_from_name_in_ref(self, node, role):
        """
        Build and return a contributor dict from a name-like node of a citation.

        :param node: a <name>, <string-name>, <name-alternatives>, <collab> or <etal> node
        :param role: role stored in the returned dict
        :return: the contributor dict, including its "contrib_xml"
        """
        params = create_contributor()
        params["role"] = role

        if node.tag == "name":
            self.update_data_from_name(node, params)
        elif node.tag == "string-name":
            self.update_data_from_name(node, params)
            # A <string-name> without structured children only has text
            if params["first_name"] == "" and params["last_name"] == "":
                params["string_name"] = node.text or ""
        elif node.tag == "name-alternatives":
            params["mid"] = self.get_data_from_name_alternatives(node)
        elif node.tag == "collab":
            params["string_name"] = node.text or ""

        use_initials = getattr(settings, "REF_JEP_STYLE", False)
        helper_update_name_params(params, use_initials)
        params["contrib_xml"] = "<etal/>" if node.tag == "etal" else get_xml_from_node(node)

        return params

    def parse_node_with_chapter_title(self, node, **kwargs):
        """
        Return the (tex, html) of a <chapter-title> node.

        Inside a mixed citation (is_mixed_citation=True in kwargs), the HTML is passed
        through add_span_class_to_html_from_chapter_title.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_chapter_title(html, **kwargs)

        return tex, html

    def parse_node_with_source(self, node, **kwargs):
        """
        Return the (tex, html) of a <source> node.

        Inside a mixed citation (is_mixed_citation=True in kwargs), the HTML is passed
        through add_span_class_to_html_from_source.
        """
        tex, html = self.parse_inner_node(node, **kwargs)

        if kwargs.get("is_mixed_citation", False):
            html = add_span_class_to_html_from_source(html, **kwargs)

        return tex, html

    def parse_citation_node(self, node, **kwargs):
        """
        Parse the children of a citation node and set the reference fields.

        Sets self.type from the publication-type attribute ("misc" by default), then reads
        titles, source, series, names, ids, dates... from the children.
        Once a <comment> has been met, a value whose field is already filled is appended
        to self.comment instead of overwriting the field.
        Empty <isbn> and <issn> elements are ignored.
        """
        self.type = get_normalized_attrib(node, "publication-type") or "misc"

        # Elsevier can store data about a translation after comments (<source>...)
        # Append these tags in the comment
        has_comment = False

        for child in node:
            tag = normalize(child.tag)

            if tag in ("page-count", "size"):
                if not self.size:
                    self.size = child.text
            elif tag == "comment":
                has_comment = True
                # comments may have ext-links or uri. HTML <a> links will be added
                _, comment = self.parse_node_with_mixed_content(
                    child, is_citation=True, is_comment=True, add_HTML_link=True
                )
                if self.comment:
                    self.comment += " "
                self.comment += comment
            elif tag == "source":
                # TODO: migration to store source_tex and source_html
                _, source_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type in ["book", "inproceedings"] and len(self.source_tex) > 0:
                    # Multiple source for a book, store the extra source in series
                    if self.series and has_comment:
                        self.comment += " " + source_tex
                    else:
                        if self.series:
                            self.series += ", "
                        self.series += get_text_from_node(child)
                elif self.source_tex and has_comment:
                    self.comment += " " + source_tex
                else:
                    self.source_tex = source_tex
            elif tag == "series":
                series = get_text_from_node(child)
                if self.series and has_comment:
                    self.comment += ", " + series
                else:
                    if self.series:
                        self.series += ", "
                    self.series += series
            elif tag == "annotation":
                if not self.annotation:
                    self.annotation = get_text_from_node(child)
            elif tag == "article-title":
                # TODO: migration to store article_title_tex and article_title_html
                _, article_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)

                if self.type == "book":
                    # Elsevier uses article-title for books !?!
                    if len(self.source_tex) == 0:
                        if has_comment:
                            self.comment += " " + article_title_tex
                        else:
                            self.source_tex = article_title_tex
                    elif self.series and has_comment:
                        self.comment += ", " + article_title_tex
                    else:
                        self.series += get_text_from_node(child)
                elif self.type == "inproceedings":
                    if self.chapter_title_tex and has_comment:
                        self.comment += " " + article_title_tex
                    else:
                        self.chapter_title_tex = article_title_tex
                elif self.article_title_tex and has_comment:
                    self.comment += " " + article_title_tex
                else:
                    self.article_title_tex = article_title_tex
            elif tag == "chapter-title":
                # TODO: migration to store chapter_title_tex and chapter_title_html
                _, chapter_title_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.chapter_title_tex and has_comment:
                    self.comment += " " + chapter_title_tex
                else:
                    self.chapter_title_tex = chapter_title_tex
            elif tag == "conf-name":
                _, conf_tex = self.parse_node_with_mixed_content(child, is_citation=True)
                if self.source_tex and has_comment:
                    self.comment += ", " + conf_tex
                else:
                    self.source_tex = conf_tex
            elif tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                params = self.get_data_from_name_in_ref(child, "author")
                self.contributors.append(params)
            elif tag == "person-group":
                self.parse_person_group(child)
            elif tag == "ext-link":
                self.parse_ext_link(child, add_ext_link=True)
            elif tag == "pub-id":
                self.parse_pub_id(child)
            elif tag == "date":
                self.year = get_text_from_node(child)
            elif tag == "date-in-citation":
                date_ = child.get("iso-8601-date") or ""
                if date_:
                    if self.comment:
                        self.comment += ", "
                    self.comment += "Accessed " + date_
            elif tag in ("isbn", "issn"):
                # child.text is None for an empty element: skip it instead of
                # raising a TypeError on the string concatenation
                if child.text:
                    if self.annotation:
                        self.annotation += ", "
                    prefix = "ISBN: " if tag == "isbn" else "ISSN: "
                    self.annotation += prefix + child.text
            elif child.text is not None:
                # Generic case: the tag name gives the name of the instance variable
                variable_name = tag.replace("-", "_")
                if has_comment and hasattr(self, variable_name) and getattr(self, variable_name):
                    if tag == "fpage":
                        self.comment += ", pp. "
                    elif tag == "lpage":
                        self.comment += "-"
                    else:
                        self.comment += ", "
                    self.comment += child.text
                elif not hasattr(self, variable_name) or not getattr(self, variable_name):
                    setattr(self, variable_name, child.text)

    def parse_person_group(self, node, **kwargs):
        """
        Parse a <person-group> node and append its names to self.contributors.

        The role is the person-group-type attribute without its trailing "s", if any.
        Children that are not name-like nodes are recorded in self.warnings.
        """
        role = node.get("person-group-type") or ""
        if role and role[-1] == "s":
            role = role[:-1]

        for child in node:
            tag = normalize(child.tag)

            if tag in ("name", "string-name", "name-alternatives", "etal", "collab"):
                contrib = self.get_data_from_name_in_ref(child, role)
                self.contributors.append(contrib)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_pub_id(self, node, **kwargs):
        """Pass the type and text of a <pub-id> node to add_extids_from_node_with_link."""
        node_type = node.get("pub-id-type") or ""

        data = {
            "rel": node_type,
            "mimetype": "",
            "location": "",
            "base": "",
            "metadata": node.text,
        }

        self.add_extids_from_node_with_link(data)

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Sets self.label_prefix and self.label_suffix: the text before and after the digits
        of the lower-cased label, once its first and last characters are removed.
        """
        label = self.label.lower()
        if len(label) > 1:
            label = label[1:-1]

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

3051

3052

class BitsCollection(CollectionData, JatsBase):
    """
    A collection parsed from a BITS <collection-meta> or <in-collection> tree.

    The tree is parsed in the constructor.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base classes, then parse kwargs["tree"] (required keyword)."""
        super().__init__(*args, **kwargs)
        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse a <collection-meta> node, or an <in-collection> node that contains one.

        Sets self.seq when a <collection-meta> was found (a warning is recorded otherwise),
        then creates self.collection with the pid of this instance.
        """
        super().parse_tree(tree)

        if tree is not None:
            tag = normalize(tree.tag)
            collection_meta_node = None
            if tag == "collection-meta":
                self.parse_collection_meta(tree)
                collection_meta_node = tree
            elif tag == "in-collection":
                for node in tree:
                    tag = normalize(node.tag)

                    if tag == "collection-meta":
                        self.parse_collection_meta(node)
                        collection_meta_node = node
                    elif tag == "volume":
                        self.parse_volume(node)
                    elif tag == "volume-series":
                        self.parse_volume_series(node)
                    elif tag == "volume-title":
                        self.parse_volume_title(node)
                    else:
                        self.warnings.append(
                            {
                                self.pid: self.__class__.__name__
                                + "."
                                + inspect.currentframe().f_code.co_name
                                + " "
                                + tag
                            }
                        )

            if collection_meta_node is not None:
                self.set_seq(collection_meta_node)
            else:
                # tag is the last tag read: the root tag, or the last child of <in-collection>
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

        self.collection = Foo()
        self.collection.pid = self.pid

    def parse_collection_meta(self, node, **kwargs):
        """
        Parse a <collection-meta> node.

        Sets coltype, pid, the issn/e_issn (also appended to self.ids), the ext_links,
        and parses the title group and the <volume-in-collection>.
        Any other child tag is recorded in self.warnings.
        """
        self.coltype = node.get("collection-type")

        for child in node:
            tag = normalize(child.tag)

            if tag == "collection-id":
                self.pid = child.text
            elif tag == "title-group":
                self.parse_title_group(child)
            elif tag == "issn":
                node_type = child.get("pub-type")
                if node_type == "ppub":
                    self.issn = child.text
                    self.ids.append(("issn", child.text))
                elif node_type == "epub":
                    self.e_issn = child.text
                    self.ids.append(("e-issn", child.text))
            elif tag == "ext-link":
                data = self.get_data_from_ext_link(child)
                self.ext_links.append(data)
            elif tag == "volume-in-collection":
                self.parse_volume_in_collection(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume(self, node, **kwargs):
        """Store the text of the node in self.volume."""
        self.volume = node.text

    def parse_volume_in_collection(self, node, **kwargs):
        """
        Parse a <volume-in-collection> node: volume number, series and title.

        Any other child tag is recorded in self.warnings.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "volume-number":
                self.parse_volume(child)
            elif tag == "volume-series":
                self.parse_volume_series(child)
            elif tag == "volume-title":
                self.parse_volume_title(child)
            else:
                self.warnings.append(
                    {
                        self.pid: self.__class__.__name__
                        + "."
                        + inspect.currentframe().f_code.co_name
                        + " "
                        + tag
                    }
                )

    def parse_volume_series(self, node, **kwargs):
        """Store the text of the node in self.vseries (None for an empty node)."""
        self.vseries = node.text

    def parse_volume_title(self, node, **kwargs):
        """Set title_tex, title_html and title_xml from a <volume-title> node."""
        self.title_tex, self.title_html = self.parse_node_with_mixed_content(node)
        self.title_xml = get_xml_from_node(node)

    def set_seq(self, node):
        """
        Set self.seq, the integer used to order the volumes of a collection.

        The "seq" attribute of the node is used when it is an integer.
        Otherwise seq is computed from self.volume (0 if it cannot be converted),
        shifted by self.vseries * 10000 when vseries is an integer.
        """
        try:
            # First, use the seq attribute, if any
            self.seq = int(node.get("seq") or "")
        except ValueError:
            # Second, use self.volume (which can be like "158-159")
            if not self.volume:
                self.seq = 0
            else:
                text = self.volume.split("-")[0]
                try:
                    self.seq = int(text)
                except ValueError:
                    self.seq = 0

            # Third, use self.vseries as an offset
            # NOTE(review): the offset is assumed to apply only on this fallback path -
            # the nesting was rebuilt from a flattened listing; confirm
            try:
                # no more than 10000 books in a series (gasp)
                self.seq = int(self.vseries) * 10000 + self.seq
            except (TypeError, ValueError):
                # vseries is None (empty <volume-series/>) or is not a number: no offset
                pass

3193

3194

class BitsBook(BookData, JatsBase):
    """
    A book parsed from a BITS <book> tree.

    The tree is parsed in the constructor: collections, metadata, parts, front matter
    and references. Contributors and title are completed once the tree is parsed.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base classes, store no_bib, then parse kwargs["tree"]."""
        super().__init__(*args, **kwargs)
        self.no_bib = kwargs.get("no_bib", False)

        self.parse_tree(kwargs["tree"])

    def parse_tree(self, tree):
        """
        Parse the children of the book node.

        Sets self.ctype ("book-" + the book-type attribute, "Book" by default).
        Unknown children are recorded in self.warnings.
        """
        super().parse_tree(tree)

        self.ctype = "book-" + (get_normalized_attrib(tree, "book-type") or "Book")

        for node in tree:
            # Only handle children of the exact same type as the root node
            # (presumably to skip comments and processing instructions - confirm)
            if type(node) != type(tree):
                continue

            tag = normalize(node.tag)

            if tag in ("collection-meta", "in-collection"):
                self.incollection.append(BitsCollection(tree=node))
            elif tag == "book-meta":
                self.parse_book_meta(node)
            elif tag == "book-body":
                self.parse_book_body(node)
            elif tag == "front-matter":
                self.parse_front_matter(node)
            elif tag == "book-back":
                for child in node:
                    child_tag = normalize(child.tag)
                    if child_tag == "ref-list":
                        self.parse_ref_list(child)
                    else:
                        where = (
                            self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                        )
                        self.warnings.append({self.pid: where + " " + child_tag})
            else:
                where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: where + " " + tag})

        self.set_contribs()
        self.set_title()

    def parse_book_body(self, node, **kwargs):
        """
        Parse a <book-body> node.

        Each <book-part> child becomes a BitsBookPart appended to self.parts
        (its warnings are merged into self.warnings).
        Without any part, the text of the node is stored in self.body.
        """
        for child in node:
            # Same type filter as in parse_tree
            if type(child) != type(node):
                continue

            tag = normalize(child.tag)

            if tag != "book-part":
                where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                self.warnings.append({self.pid: where + " " + tag})
                continue

            book_part = BitsBookPart(tree=child)
            self.warnings.extend(book_part.warnings)
            self.parts.append(book_part)

        if not self.parts:
            self.body = get_text_from_node(node)

    def parse_book_meta(self, node, **kwargs):
        """
        Parse a <book-meta> node and set the matching instance variables.

        Tags without a dedicated branch are dispatched to self.parse_<tag>
        (with add_ext_link=True) when such a callable exists; otherwise the tag is
        recorded in self.warnings.
        If no last-modified date was set, the current time is used.
        """
        for child in node:
            tag = normalize(child.tag)

            if tag == "book-id":
                self.parse_id(child)
            elif tag == "pub-date":
                self.year = self.get_data_from_date(child)
            elif tag == "book-volume-number":
                self.volume = child.text
                self.volume_int = child.text
            elif tag == "pub-history":
                for entry in self.get_data_from_history(child):
                    if entry["type"] == "last-modified":
                        self.last_modified_iso_8601_date_str = entry["date"]
                    elif entry["type"] == "prod-deployed-date":
                        self.prod_deployed_date_iso_8601_date_str = entry["date"]
            elif tag == "book-title-group":
                self.parse_title_group(child)
            elif tag == "publisher":
                self.publisher = JatsPublisher(tree=child)
            else:
                handler = getattr(self, "parse_" + tag.replace("-", "_"), None)
                if callable(handler):
                    handler(child, add_ext_link=True)
                else:
                    where = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
                    self.warnings.append({self.pid: where + " " + tag})

        if self.last_modified_iso_8601_date_str is None:
            self.last_modified_iso_8601_date_str = timezone.now().isoformat()

    def parse_custom_meta_group(self, node, **kwargs):
        """Read the <custom-meta> children and store the "provider" value, if any."""
        for child in node:
            if normalize(child.tag) != "custom-meta":
                continue

            name, value = self.get_data_from_custom_meta(child)
            if name == "provider":
                self.provider = value

    def set_contribs(self):
        """
        Update the contributors if the XML does not declare any author
        - with the contributors of the first part if the book is a monograph
        - with the contributors of the parts if the book is an edited book or lecture notes
          and all parts have the same contributors (an edited book then becomes a monograph)
        - with a single "Collectif" author if the parts have different contributors

        Nothing is done if the book already has an author or has no parts.

        :return:
        """

        authors = [contrib for contrib in self.contributors if contrib["role"] == "author"]
        if authors or not self.parts:
            return

        first_part_contributors = self.parts[0].contributors

        if self.ctype == "book-monograph":
            self.contributors = first_part_contributors
        elif self.ctype in ("book-edited-book", "book-lecture-notes"):
            # check if authors of the book-parts are identical
            parts_differ = any(
                part.contributors != first_part_contributors for part in self.parts[1:]
            )

            if parts_differ:
                contrib = create_contributor()
                contrib["string_name"] = "Collectif"
                contrib["role"] = "author"
                contrib["contrib_xml"] = get_contrib_xml(contrib)
                self.contributors.append(contrib)
            else:
                if self.ctype == "book-edited-book":
                    self.ctype = "book-monograph"
                self.contributors = first_part_contributors

    def set_title(self):
        """Use the title of the first collection when the book has no title_xml."""
        if self.title_xml != "" or len(self.incollection) == 0:
            return

        first_collection = self.incollection[0]
        self.title_xml = first_collection.title_xml
        self.title_html = first_collection.title_html
        self.title_tex = first_collection.title_tex

3367

3368

3369class BitsBookPart(BookPartData, JatsArticleBase):

3370 def __init__(self, *args, **kwargs):

3371 super().__init__(*args, **kwargs)

3372 self.no_bib = kwargs.get("no_bib", False)

3373 self.parse_tree(kwargs["tree"])

3374

3375 def parse_tree(self, tree):

3376 super().parse_tree(tree)

3377

3378 self.atype = get_normalized_attrib(tree, "book-part-type") or ""

3379 try:

3380 self.seq = int(get_normalized_attrib(tree, "seq") or "")

3381 except ValueError:

3382 pass

3383

3384 for node in tree:

3385 tag = normalize(node.tag)

3386

3387 if tag == "book-part-meta":

3388 self.parse_book_part_meta(node)

3389 elif tag == "body":

3390 self.parse_body(node)

3391 elif tag == "front-matter": 3391 ↛ 3392line 3391 didn't jump to line 3392, because the condition on line 3391 was never true

3392 self.parse_front_matter(node)

3393 elif tag == "back": 3393 ↛ 3410line 3393 didn't jump to line 3410, because the condition on line 3393 was never false

3394 for child in node:

3395 tag = normalize(child.tag)

3396

3397 if tag == "ref-list": 3397 ↛ 3400line 3397 didn't jump to line 3400, because the condition on line 3397 was never false

3398 self.parse_ref_list(child)

3399 else:

3400 self.warnings.append(

3401 {

3402 self.pid: self.__class__.__name__

3403 + "."

3404 + inspect.currentframe().f_code.co_name

3405 + " "

3406 + tag

3407 }

3408 )

3409 else:

3410 self.warnings.append(

3411 {

3412 self.pid: self.__class__.__name__

3413 + "."

3414 + inspect.currentframe().f_code.co_name

3415 + " "

3416 + tag

3417 }

3418 )

3419

3420 # Workaround a numdam-plus bug where a book-part can have a trans-title without a title

3421 # TODO: Fix numdam-plus, the books impacted and remove the hack

3422 self.set_title()

3423

def parse_book_part_meta(self, node, **kwargs):
    """
    Read the children of a <book-part-meta> node and set instance variables.

    The id and page tags are handled here. Any other tag is dispatched to the
    parse_<tag> method of the instance (dashes replaced by underscores) when
    such a callable exists; otherwise a warning is recorded in self.warnings.

    :param node: the <book-part-meta> node
    :param kwargs: accepted but not used by this method
    """
    for element in node:
        name = normalize(element.tag)

        if name == "book-part-id":
            self.parse_id(element)
            continue

        if name == "fpage":
            self.fpage = element.text
            # A missing content-type is stored as an empty string
            self.page_type = get_normalized_attrib(element, "content-type") or ""
            continue

        if name == "lpage":
            self.lpage = element.text
            continue

        if name == "page-range":
            self.page_range = element.text
            continue

        # Automatic dispatch: "some-tag" is handled by self.parse_some_tag
        handler = getattr(self, "parse_" + name.replace("-", "_"), None)
        if callable(handler):
            handler(element)
            continue

        # Unknown tag: record "<ClassName>.<function name> <tag>" under the pid
        location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
        self.warnings.append({self.pid: location + " " + name})

3452

def parse_body(self, node, **kwargs):
    """
    Read the children of a <body> node.

    Each <book-part> child is parsed into a BitsBookPart that is appended to
    self.parts, and its warnings are merged into self.warnings. Any other
    child is reported as a warning. Finally self.body is set from
    get_text_from_node(node).

    :param node: the <body> node
    :param kwargs: accepted but not used by this method
    """
    for element in node:
        name = normalize(element.tag)

        if name != "book-part":
            # Unexpected tag: record "<ClassName>.<function name> <tag>" under the pid
            location = self.__class__.__name__ + "." + inspect.currentframe().f_code.co_name
            self.warnings.append({self.pid: location + " " + name})
            continue

        part = BitsBookPart(tree=element)
        self.warnings.extend(part.warnings)
        self.parts.append(part)

    self.body = get_text_from_node(node)

3473

def set_title(self):
    """
    Fall back on the translated title when the title is missing.

    Some books contain chapters with a trans-title but no title. In that case
    the translated title (html and tex) is copied into the title fields.
    Nothing is changed when a title exists or when there is no trans-title.
    :return:
    """
    if not self.trans_title_html or self.title_html:
        return

    self.title_html = self.trans_title_html
    self.title_tex = self.trans_title_tex

3484

3485

3486######################################################################################

3487#

3488# Functions used by ptf-tools

3489#

3490######################################################################################

3491

3492

def update_bibitem_xml(bibitem, new_ids):
    """
    Rebuild a reference from a bibitem after replacing its identifiers.

    The citation_xml of the bibitem is wrapped in a <ref> element and parsed.
    Inside its <element-citation> child (or <mixed-citation> when there is no
    <element-citation>), the <ext-link> and <pub-id> children carrying one of
    the identifier types listed below are removed. Then one node is appended
    for each entry of new_ids that is checked and is not a false positive.

    :param bibitem: object with a citation_xml string attribute
    :param new_ids: dict {id_type: {"checked": ..., "false_positive": ..., "id_value": ...}}
    :return: a JatsRef created from the updated tree, with lang "und"
    """
    # tag -> (attribute holding the id type, id types that get removed)
    removal_rules = {
        "ext-link": (
            "ext-link-type",
            {"zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id", "eid"},
        ),
        "pub-id": (
            "pub-id-type",
            {"zbl-item-id", "mr-item-id", "doi", "numdam-id", "mathdoc-id"},
        ),
    }
    # New ids of these types are written as <pub-id>, all the others as <ext-link>
    pub_id_types = {"doi", "arxiv", "tel", "hal", "theses.fr"}

    source = "<ref>" + bibitem.citation_xml + "</ref>"
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring(source, parser=xml_parser)

    # Explicit "is None" tests: the truth value of an lxml element is not
    # a reliable "found / not found" indicator
    citation = ref_tree.find("element-citation")
    if citation is None:
        citation = ref_tree.find("mixed-citation")

    if citation is not None:
        # Collect first, remove afterwards: do not mutate the node while iterating on it
        obsolete = []
        for element in citation:
            rule = removal_rules.get(element.tag)
            if rule is None:
                continue
            type_attribute, removable_types = rule
            if element.get(type_attribute) in removable_types:
                obsolete.append(element)

        for element in obsolete:
            citation.remove(element)

        for id_type, info in new_ids.items():
            if not info["checked"] or info["false_positive"]:
                continue

            if id_type in pub_id_types:
                tag_name, type_attribute = "pub-id", "pub-id-type"
            else:
                tag_name, type_attribute = "ext-link", "ext-link-type"

            new_node = etree.Element(tag_name)
            new_node.set(type_attribute, id_type)
            new_node.text = info["id_value"]
            citation.append(new_node)

    # TODO Modify the call to update_bibitem_xml and pass the parent's lang
    return JatsRef(tree=ref_tree, lang="und")

3546

3547

def check_bibitem_xml(bibitem):
    """
    Parse the citation of a bibitem and return the corresponding JatsRef.

    The citation_xml of the bibitem is wrapped in a <ref> element, parsed with
    a recovering parser (comments removed) and handed over to JatsRef.

    :param bibitem: object with a citation_xml string attribute
    :return: a JatsRef created from the parsed tree, with lang "und"
    """
    source = "<ref>" + bibitem.citation_xml + "</ref>"
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    ref_tree = etree.fromstring(source, parser=xml_parser)

    return JatsRef(tree=ref_tree, lang="und")

3557

3558

3559# Create XML strings based on internal data

3560

3561

def get_single_title_xml(title):
    """
    Escape a title for XML while keeping its italic/superscript/subscript markup.

    <i>...</i> becomes <italic>...</italic>; <sup> and <sub> are kept as is.
    A markup is preserved only if both its opening and its closing tags are
    found in the title; everything else goes through escape().

    :param title: the title text, possibly with <i>, <sup>, <sub> tags
    :return: the escaped title with the preserved tags
    """
    # (tag found in the title, tag written in the result)
    markups = (("i", "italic"), ("sup", "sup"), ("sub", "sub"))

    # Decide which markups to preserve before the title is modified
    preserved = [
        (html_tag, xml_tag)
        for html_tag, xml_tag in markups
        if f"<{html_tag}>" in title and f"</{html_tag}>" in title
    ]

    # Hide the tags behind placeholders so that escape() leaves them alone
    for html_tag, _ in preserved:
        title = title.replace(f"<{html_tag}>", f"|||{html_tag}|||")
        title = title.replace(f"</{html_tag}>", f"|||/{html_tag}|||")

    title = escape(title)

    # Turn the placeholders back into tags
    for html_tag, xml_tag in preserved:
        title = title.replace(f"|||{html_tag}|||", f"<{xml_tag}>")
        title = title.replace(f"|||/{html_tag}|||", f"</{xml_tag}>")

    return title

3586

3587

def get_title_xml(title, trans_title=None, trans_lang=None, with_tex_values=True):
    """
    Build the <title-group> XML of an article from a simple title.

    When with_tex_values is True, the title (and the translated title) go
    through get_single_title_xml first. If the title has formulas, run the
    CKeditorParser first, then call this function with the value_xml it
    returned and with_tex_values set to False.
    The <trans-title-group> is added only if both trans_title and trans_lang
    are set.
    TODO: enhance CkeditorParser to accept both title and trans_title to build the xml in 1 shot.

    :return: the <title-group> XML string
    """
    if with_tex_values:
        title = get_single_title_xml(title)

    pieces = [
        '<title-group xmlns:xlink="http://www.w3.org/1999/xlink">',
        f'<article-title xml:space="preserve">{title}</article-title>',
    ]

    if trans_title and trans_lang:
        if with_tex_values:
            trans_title = get_single_title_xml(trans_title)
        pieces.append(
            f'<trans-title-group xml:lang="{trans_lang}"><trans-title>{trans_title}</trans-title></trans-title-group>'
        )

    pieces.append("</title-group>")

    return "".join(pieces)

3609

3610

def get_issue_title_xml(title, lang, trans_title=None, trans_lang=None):
    """
    Build the <issue-title> XML of an issue from a simple title.

    The title goes through get_single_title_xml. A second <issue-title>, for
    the translated title, is appended only if both trans_title and trans_lang
    are set.

    :return: one <issue-title> element, or two concatenated elements
    """

    def issue_title(text, text_lang):
        # One <issue-title> element for the given text and language
        text = get_single_title_xml(text)
        return f'<issue-title xml:lang="{text_lang}" xml:space="preserve">{text}</issue-title>'

    xml = issue_title(title, lang)

    if trans_title and trans_lang:
        xml += issue_title(trans_title, trans_lang)

    return xml

3623

3624

def get_name_params(first_name, last_name, prefix, suffix, orcid):
    """
    Gather the parts of a person name in a dict.

    The dict is passed to helper_update_name_params before being returned.
    NOTE(review): the helper's return value is ignored, so it presumably
    updates the dict in place - confirm against its definition.

    :return: dict with the keys first_name, last_name, prefix, suffix, orcid
    """
    params = dict(
        first_name=first_name,
        last_name=last_name,
        prefix=prefix,
        suffix=suffix,
        orcid=orcid,
    )
    helper_update_name_params(params)

    return params

3636

3637

def get_tex_from_xml(xml, tag, **kwargs):
    """
    Get the TeX value(s) of an XML fragment by parsing it as a JatsArticle.

    For the "abstract" and "title" tags, the fragment is wrapped in an
    <article><front><article-meta> skeleton before being parsed.

    :param xml: the XML fragment (string)
    :param tag: "abstract" or "title"
    :param kwargs: passed as is to JatsArticle
    :return: the value_tex of the first abstract if tag is "abstract",
             the tuple (title_tex, trans_title_tex) if tag is "title",
             an empty string for any other tag
    """
    xml_parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
    )
    etree.register_namespace("mml", "http://www.w3.org/1998/Math/MathML")

    # The xlink namespace declaration, if any, is left in the fragment
    document = xml
    if tag in ("abstract", "title"):
        document = f"<article><front><article-meta>{xml}</article-meta></front></article>"

    article_tree = etree.fromstring(document.encode("utf-8"), parser=xml_parser)
    xarticle = JatsArticle(tree=article_tree, **kwargs)

    if tag == "abstract":
        return xarticle.abstracts[0]["value_tex"]

    if tag == "title":
        return xarticle.title_tex, xarticle.trans_title_tex

    return ""

Coverage for apps/ptf/cmds/xml/jats/jats_parser.py: 70%

2055 statements