Coverage for apps/ptf/cmds/xml/jats/xmldata.py: 15%

1850 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-19 19:20 +0000

1import os 

2import re 

3import unicodedata 

4 

5from lxml import etree 

6 

7from django.conf import settings 

8from django.utils import timezone 

9from django.utils.translation import gettext_lazy as _ 

10 

11from ptf.cmds.xml.xml_utils import escape 

12from ptf.cmds.xml.xml_utils import normalize 

13from ptf.cmds.xml.xml_utils import remove_namespace 

14 

15 

def get_attribute_value(node, fullname, basename=None, name=None):
    """Return the value of the attribute ``fullname`` of ``node``.

    The attribute is only looked up when ``basename`` equals ``name``
    (both default to ``None``, so a two-argument call always looks it up).
    An empty string is returned when the two names differ or when the
    attribute is not present on the node.
    """
    if basename == name:
        try:
            return node.attrib[fullname]
        except KeyError:
            return ""
    return ""

25 

26 

def get_lang_attrib(node):
    """Return the language attribute of ``node``, or "und" when there is none.

    Every attribute name is passed through ``normalize`` and compared with
    "lang". When several attributes match, the last one wins.
    """
    if node is None:
        return "und"

    lang = "und"
    for attrib_name, attrib_value in node.attrib.items():
        if normalize(attrib_name) == "lang":
            lang = attrib_value
    return lang

36 

37 

def get_href_attrib(node):
    """Return the href attribute of ``node``, or ``None`` when there is none.

    Every attribute name is passed through ``normalize`` and compared with
    "href". When several attributes match, the last one wins.
    """
    if node is None:
        return None

    href = None
    for attrib_name, attrib_value in node.attrib.items():
        if normalize(attrib_name) == "href":
            href = attrib_value
    return href

47 

48 

def innerxml(node):
    """Return the serialized content of ``node`` (without its own tag) as UTF-8 bytes.

    The leading text of the node is escaped with ``escape``; each child is
    serialized with ``etree.tostring``. The joined result is stripped of
    surrounding whitespace before being encoded.
    """
    parts = [escape(node.text)] if node.text else []
    # Iterate over the element itself: getchildren() is deprecated in lxml
    # in favour of direct iteration / list(element).
    parts.extend(etree.tostring(child, encoding="unicode") for child in node)
    return "".join(parts).strip().encode("utf-8")

57 

58 

def get_node_text(node):
    """Return the text content of ``node`` (tags stripped, tail excluded).

    An empty string is returned when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

66 

67 

68########################################################################## 

69# 

70# get_mixed_content: recreate the xml string from a node 

71# 

72# Used to export data (OAI) 

73# 

74########################################################################## 

75 

76 

def get_mixed_content(node):
    """Return ``node`` serialized back to an XML string (tail excluded).

    An empty string is returned when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

84 

85 

86########################################################################## 

87# 

88# get_tex: get the tex version of a node with mixed-content 

89# 

90# Strip the mathml alternative of formula 

91# 

92# Used to prepare the HTML pages. A Django template can simply display title_tex 

93# 

94########################################################################## 

def get_tex(node, is_top=True, is_citation=False):
    """Return the TeX/text version of a mixed-content node.

    ``math`` elements are skipped (only their tail is kept), and
    ``element-citation`` nodes are delegated to ``get_element_citation_str``.
    The tail of the node is appended unless ``is_top`` is true.

    When ``is_citation`` is true, a ``string-name`` is title-cased and wrapped
    in a "citation-author" span, and ``article-title`` / ``chapter-title`` /
    ``italic`` are wrapped in a "citation-title" span.
    """
    if node is None:
        return ""

    tag = normalize(node.tag)
    pieces = []

    if tag == "element-citation":
        pieces.append(get_element_citation_str(node, is_top))
    elif tag != "math":
        wrap_as_author = False
        wrap_as_title = False

        if tag == "mixed-citation":
            is_citation = True
        elif is_citation and tag == "string-name":
            wrap_as_author = True
        elif is_citation and tag in ("article-title", "chapter-title", "italic"):
            wrap_as_title = True

        if node.text:
            pieces.append(node.text)

        # NOTE(review): is_citation is not forwarded to the children, so they
        # are always processed with is_citation=False - confirm this is wanted.
        pieces.extend(get_tex(child, False) for child in node)

        content = "".join(pieces)
        if wrap_as_title:
            content = '<span class="citation-title">' + content + "</span>"
        elif wrap_as_author:
            content = '<span class="citation-author">' + content.title() + "</span>"
        pieces = [content]

    if node.tail and not is_top:
        pieces.append(node.tail)

    return "".join(pieces)

132 

133 

def make_links_clickable(href, string):
    """Wrap ``string`` in an HTML anchor when ``href`` looks like a link.

    * ``http://`` and ``https://`` URLs open in a new tab (target="_blank").
    * Site-relative paths (starting with "/") become plain anchors.
    * Anything else, including an empty or ``None`` href, returns ``string``
      unchanged.
    """
    if not href:
        return string
    # Require the scheme separator: a bare "http" prefix would also match
    # ordinary text such as "httpd ...".
    if re.match(r"https?://", href):
        return f'<a href="{href}" target="_blank">{string}</a>'
    if href.startswith("/"):
        return f'<a href="{href}">{string}</a>'
    return string

140 

141 

142########################################################################## 

143# 

144# get_html_mixed_content_with_figures: get the mathml version of a node with mixed-content 

145# 

146# Strip the tex alternative of formula, add the tex version to the tooltip (HTML <title> tag) 

147# 

148# Used to prepare the HTML pages. A Django template can simply display the_html 

149# 

150# TODO: Use a dict to pass the params 

151# 

152########################################################################## 

def get_html_mixed_content_with_figures(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    is_figure=False,
    prefix="",
    suffix="",
    sec_level=2,
    label_title="",
    figures=None,
    base_url="",
):
    """Convert a mixed-content XML node into an HTML fragment, collecting figures.

    The function recurses over the children of ``node`` and returns a
    ``(html, figures)`` tuple.

    Parameters:
        node: element to convert; ``None`` yields ``("", figures)``.
        is_top: when true, the tail of ``node`` is not appended.
        is_citation / is_comment / is_figure: context flags propagated to the
            children (citation and comment contexts turn links into anchors,
            figure context enables the handling of ``graphic``).
        prefix / suffix: strings added before the node text / after the result.
        sec_level: heading level used for ``sec`` and ``statement`` titles.
        label_title: label of an enclosing ``list-item``, prepended to ``p``.
        figures: list receiving one dict per ``graphic`` found inside a figure.
            It is created on demand when ``None`` is passed.
        base_url: base used to build the ``src`` of figure images.

    Side effect: ``label``, ``title`` and figure ``caption`` children are
    removed from ``node`` once they have been rendered.
    """
    text = ""
    if node is None:
        return text, figures

    is_citation_author = False
    is_citation_title = False
    is_citation_volume = False

    normalized_tag = normalize(node.tag)

    if normalized_tag == "element-citation":
        # Specific case: the order of the children of an element-citation
        # might not be the order of display.
        text = get_element_citation_str(node, is_top)
    # pub-id are ignored by default as they are treated separately.
    # Inside citations or comments, ext-links are converted into HTML links.
    elif is_comment or normalized_tag not in ("pub-id", "object-id"):
        if normalized_tag in ("mixed-citation", "toc"):
            is_citation = True
        elif normalized_tag == "comment":
            is_comment = True
        elif is_citation and normalized_tag == "string-name":
            is_citation_author = True
        elif is_citation and normalized_tag in ("article-title", "chapter-title", "italic"):
            is_citation_title = True
        elif is_citation and normalized_tag == "volume":
            is_citation_volume = True

        text += prefix

        if is_citation and normalized_tag == "ext-link":
            if node.get("ext-link-type") is None:
                href = get_href_attrib(node) or node.text
                # Links without any target are skipped instead of raising;
                # links to www.numdam.org are ignored.
                if href and "www.numdam.org" not in href:
                    text += make_links_clickable(href, node.text or href)
        elif is_citation and normalized_tag == "uri":
            href = get_href_attrib(node) or node.text
            if href:
                text += make_links_clickable(href, node.text or href)
        elif is_comment and node.text:
            # Text that starts with blanks or newlines is dropped.
            if not re.match(r"[\n ]+", node.text):
                text += make_links_clickable(node.text, node.text)
        elif node.text:
            text += node.text

        label = ""
        if normalized_tag in ("sec", "statement", "fig", "list-item", "table-wrap"):
            # label/title are rendered here and removed from the tree so that
            # the generic child loop below does not render them a second time.
            child = node.find("label")
            if child is not None:
                label += child.text or ""
                node.remove(child)
            child = node.find("title")
            if child is not None:
                if label:
                    label += " "
                label += child.text or ""
                node.remove(child)

        if normalized_tag in ("sec", "statement"):
            heading = "h" + str(sec_level)
            text = "<" + heading + ">" + label + "</" + heading + ">"
            sec_level += 1

        if normalized_tag == "table-wrap":
            text = "<strong>" + label + "</strong>"

        if normalized_tag == "fig":
            is_figure = True
            child = node.find("caption")
            if child is not None:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    "",
                    figures,
                    base_url,
                )
                label += " : " + child_text
                node.remove(child)

        if normalized_tag == "list-item":
            label_title = label

        if normalized_tag == "p" and label_title:
            text = label_title + " " + text
            label_title = ""

        if normalized_tag in ("inline-formula", "disp-formula"):
            for child in node:
                if child.tag == "alternatives":
                    math_text = ""
                    tex_text = ""

                    for great_child in child:
                        # Compare inline: normalized_tag must keep describing
                        # the current node for the wrapping step below.
                        if normalize(great_child.tag) == "math":
                            math_text = get_mixed_content(great_child)
                        else:
                            tex_text = get_tex(great_child)

                    text += '<span title="' + tex_text + '">' + math_text + "</span>"
        else:
            for child in node:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    label_title,
                    figures,
                    base_url,
                )
                text += child_text

        if is_citation_title:
            text = '<span class="citation-document-title">' + text + "</span>"
        elif is_citation_author:
            text = '<span class="citation-author">' + text.title() + "</span>"
        elif is_citation_volume:
            text = '<span class="citation-volume">' + text + "</span>"
        elif normalized_tag == "list":
            ordered_types = {
                "order": "1",
                "alpha-lower": "a",
                "alpha-upper": "A",
                "roman-lower": "i",
                "roman-upper": "I",
            }
            list_type = node.get("list-type")
            if list_type is None or list_type == "bullet":
                text = "<ul>" + text + "</ul>"
            elif list_type in ordered_types:
                text = '<ol type="' + ordered_types[list_type] + '">' + text + "</ol>"
            else:
                text = '<ul class="no-bullet" style="list-style-type:none;">' + text + "</ul>"
        elif normalized_tag == "list-item":
            text = "<li>" + text + "</li>"
        elif normalized_tag in ("strong", "bold"):
            text = "<strong>" + text + "</strong>"
        elif normalized_tag == "italic":
            text = '<span class="italique">' + text + "</span>"
        elif normalized_tag == "p":
            specific_use = node.get("specific-use")
            if specific_use:
                text = '<p class="' + specific_use + '">' + text + "</p>"
            else:
                text = "<p>" + text + "</p>"
        elif normalized_tag == "caption" and not is_figure:
            text = '<div class="caption">' + text + "</div>"
        elif normalized_tag in ("sec", "statement"):
            text = "<section>" + text + "</section>"
        elif normalized_tag == "fig":
            fig_id = node.get("id")
            tag = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
            text = tag + text
            if label:
                text += "<figcaption>" + label + "</figcaption>"
            text += "</figure>"
        elif normalized_tag in ("sub", "sup"):
            text = "<" + normalized_tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "xref":
            rid = node.get("rid")
            if rid:
                text = '<a href="#' + rid + '">' + text + "</a>"
        elif normalized_tag == "graphic" and is_figure:
            # Look the href up independently of the attribute order: a later
            # attribute must not reset an href that was already found.
            href = get_href_attrib(node) or ""

            if href:
                basename = os.path.basename(href)
                is_png = basename.split(".")[-1] == "png"

                location = "src/tex/figures/" + basename
                figure = {
                    "rel": "image",
                    "mimetype": "image/png" if is_png else "image/jpeg",
                    "location": location,
                    "base": None,
                    "text": node.text if node.text is not None else "",
                }

                img_src = os.path.join(base_url, "png" if is_png else "jpg", location)
                text = '<img src="' + img_src + '" class="article-body-img" />'

                # Callers such as get_html_mixed_content pass figures=None.
                if figures is None:
                    figures = []
                figures.append(figure)
        elif normalized_tag in ("table", "th", "thead", "tr", "td"):
            tag = "<" + normalized_tag
            if "rowspan" in node.attrib:
                tag += ' rowspan="' + node.attrib["rowspan"] + '"'
            text = tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "table-wrap":
            tag = '<div class="table-wrap"'
            wrap_id = node.get("id")
            if wrap_id:
                tag += ' id="' + wrap_id + '"'

            text = tag + ">" + text + "</div>"

    if node.tail and not is_top:
        text += node.tail

    text += suffix

    return text, figures

434 

435 

def get_html_mixed_content(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    prefix="",
    suffix="",
    sec_level=2,
    label="",
):
    """Return the HTML version of a mixed-content node.

    Thin wrapper around ``get_html_mixed_content_with_figures`` that starts
    outside of any figure, passes no figure list and discards the figures
    returned by the call.
    """
    html, _figures = get_html_mixed_content_with_figures(
        node,
        is_top=is_top,
        is_citation=is_citation,
        is_comment=is_comment,
        is_figure=False,
        prefix=prefix,
        suffix=suffix,
        sec_level=sec_level,
        label_title=label,
        figures=None,
    )
    return html

450 

451 

452########################################################################## 

453# 

454# get_element_citation_str: get the mixed content of an element-citation node 

455# 

456# An element-citation node is specific as the order of its children might not be 

457# the correct order for display 

458# 

459# Used to prepare the HTML pages. A Django template can simply display title_html 

460# 

461########################################################################## 

def get_element_citation_str(node, is_top=False, is_html=True):
    """Build the display string of an ``element-citation`` node.

    The children of an element-citation are not necessarily stored in display
    order, so the pieces are fetched one by one: authors, document title,
    source (+ editors), series, volume, publisher data, year, issue, eids,
    pages, external links and comment. The punctuation depends on the
    ``REF_JEP_STYLE`` setting and on the ``publication-type`` attribute.

    Parameters:
        node: the element-citation element; ``None`` yields "".
        is_top: accepted for interface compatibility; not used by this function.
        is_html: when false, the article/chapter titles are produced with
            ``get_tex`` instead of ``get_html_mixed_content``.
    """
    if node is None:
        return ""

    jep_style = getattr(settings, "REF_JEP_STYLE", False)
    pub_type = node.get("publication-type")

    def field(tag, prefix="", suffix="", is_comment=False):
        # HTML of the first child named `tag` ("" when the child is absent).
        return get_html_mixed_content(node.find(tag), True, True, is_comment, prefix, suffix)

    name_str = get_author_str(node)
    text = name_str

    # --- document title (article and/or chapter) ---
    document_title = ""
    if is_html:
        if jep_style:
            prefix, suffix = " - &ldquo;", "&rdquo;"
        else:
            prefix, suffix = " ", ""
        document_title += field("article-title", prefix, suffix)

        if jep_style and pub_type == "incollection":
            document_title += field("chapter-title", prefix, suffix)
        else:
            document_title += field("chapter-title", " ")
    else:
        document_title += " " + get_tex(node.find("article-title"))
        document_title += " " + get_tex(node.find("chapter-title"))

    text += document_title

    # --- source ---
    prefix = ""
    suffix = "</span>"

    if document_title:
        if jep_style and pub_type == "incollection":
            prefix += ', in <span class="citation-publication-title">'
        else:
            prefix += ', <span class="citation-publication-title">'
    else:
        # Without a document title, the source plays the role of the title.
        if name_str:
            prefix = " "
            if jep_style:
                prefix = " - "
        if pub_type in ["unpublished", "misc"]:
            prefix += "&ldquo;"
            suffix += "&rdquo;"
        prefix += '<span class="citation-document-title">'

    source = field("source", prefix, suffix)
    if jep_style and pub_type == "book":
        source = f"<i>{source}</i>"
    if pub_type in ["book", "incollection"]:
        source += get_editor_str(node.find("person-group"))
    text += source

    # --- series ---
    if document_title:
        if jep_style and pub_type == "incollection":
            prefix = ", "
        else:
            prefix = " ("
        suffix = ")"
    else:
        if jep_style and pub_type == "book":
            prefix = ', <span class="citation-publication-title-book">'
        else:
            prefix = ', <span class="citation-publication-title">'
        suffix = "</span>"

    serie = field("series", prefix, suffix)
    text += serie

    # --- volume ---
    if jep_style:
        if pub_type in ["incollection", "book"]:
            prefix = ", vol. "
        else:
            prefix = " "
    else:
        if document_title:
            prefix = " " if serie else ", "
        else:
            prefix = ", " if serie else " "
        prefix += str(_("Tome")) + " "

    text += field("volume", prefix)

    # --- publisher data and year ---
    if pub_type in ["incollection", "book"]:
        text = text.replace("citation-volume", "citation-volume-incollection")
        text += field("publisher-name", ", ")
        text += field("publisher-loc", ", ")
        text += field("institution", ", ")
        prefix = ", "
        suffix = ""
    elif pub_type == "misc":
        prefix = ", "
        suffix = ""
    else:
        prefix = " ("
        suffix = ")"
    text += field("year", prefix, suffix)
    text += field("issue", " no. ")

    # --- electronic ids (elements without text are skipped) ---
    for child in node.findall("pub-id"):
        if child.get("pub-id-type") == "eid" and child.text is not None:
            text += ", " + child.text

    for child in node.findall("ext-link"):
        if child.get("ext-link-type") == "eid" and child.text is not None:
            text += (", article ID " if jep_style else ", ") + child.text

    if not (jep_style and pub_type == "book"):
        text += get_pages_str(node)

    # --- untyped external links (never displayed in JEP style) ---
    if not jep_style:
        for child in node.findall("ext-link"):
            if child.get("ext-link-type") is not None:
                continue
            href = get_href_attrib(child) or child.text
            # bibitem with ext-links pointing to numdam.org have a numdam-id,
            # ext-links to doi.org are transformed in an extid:
            # both cases can be ignored.
            if href and "www.numdam.org" not in href and "doi.org" not in href:
                text += " " + make_links_clickable(href, child.text or href)

    if jep_style:
        text += field("comment", ", ", "", is_comment=True)
    else:
        text += field("comment", " (", ")", is_comment=True)

    return text

627 

628 

629def get_name_str(node): 

630 text = "" 

631 REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False) 

632 

633 if node is not None: 

634 names = node.findall("name") 

635 i = 1 

636 for name_node in names: 

637 first_name = last_name = prefix = suffix = string_name = "" 

638 

639 for child in name_node: 

640 if child.tag == "given-names": 

641 if REF_JEP_STYLE: 

642 first_name += child.get("initials", "") 

643 else: 

644 if child.text is None: 

645 child.text = "" 

646 first_name += child.text 

647 if child.tag == "surname": 

648 last_name += child.text 

649 if child.tag == "prefix": 

650 prefix += child.text 

651 if child.tag == "suffix": 

652 suffix += child.text 

653 

654 if prefix: 

655 string_name = prefix + " " 

656 

657 if getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False): 

658 if first_name: 

659 string_name += first_name + " " 

660 string_name += last_name 

661 else: 

662 string_name += last_name 

663 

664 if first_name: 

665 string_name += ", " + first_name 

666 

667 if suffix: 

668 string_name += " " + suffix 

669 

670 if text: 

671 if i == len(names) and REF_JEP_STYLE: 

672 text += " & " 

673 elif REF_JEP_STYLE: 

674 text += ", " 

675 else: 

676 text += "; " 

677 

678 text += string_name 

679 i += 1 

680 

681 names = node.findall("string-name") 

682 i = 1 

683 for name_node in names: 

684 string_name = get_tex(name_node) 

685 

686 if text: 

687 if i == len(names) and REF_JEP_STYLE: 

688 text += " & " 

689 elif REF_JEP_STYLE: 

690 text += ", " 

691 else: 

692 text += "; " 

693 

694 text += string_name 

695 i += 1 

696 return text 

697 

698 

def get_author_str(node):
    """Return the names found under ``node`` wrapped in a "citation-author" span.

    The span is emitted even when ``get_name_str`` returns an empty string.
    """
    return '<span class="citation-author">' + get_name_str(node) + "</span>"

702 

703 

def get_editor_str(node):
    """Return the editors found under ``node`` as " (names, ed.)" or " (names, eds.)".

    An empty string is returned when ``get_name_str`` finds no name.
    """
    editors = get_name_str(node)
    if editors:
        # Names are separated by ";" or, in JEP style, by "&": the presence of
        # either character means there is more than one editor.
        several = "&" in editors or ";" in editors
        return " (" + editors + ", " + ("eds." if several else "ed.") + ")"
    return ""

712 

713 

714def get_pages_str(node): 

715 text = "" 

716 REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False) 

717 child = node.find("page-count") 

718 if child is not None: 

719 text += get_html_mixed_content(child, True, True, False, ", ", " pages") 

720 

721 if not text: 

722 child = node.find("size") 

723 if child is not None: 

724 text += get_html_mixed_content(child, True, True, False, ", ", " pages") 

725 

726 if not text: 

727 first_page_child = node.find("fpage") 

728 if first_page_child is not None: 

729 fpage_text = get_html_mixed_content(first_page_child, True, True, False) 

730 lpage_text = "" 

731 fpage_int = lpage_int = 0 

732 try: 

733 fpage_int = int(fpage_text) 

734 except BaseException: 

735 pass 

736 

737 last_page_child = node.find("lpage") 

738 if last_page_child is not None: 

739 lpage_text = get_html_mixed_content(last_page_child, True, True, False) 

740 try: 

741 lpage_int = int(lpage_text) 

742 except BaseException: 

743 pass 

744 

745 if lpage_int > 0 and lpage_int - fpage_int > 1 and not REF_JEP_STYLE: 

746 text += ", pp. " 

747 else: 

748 text += ", p. " 

749 text += fpage_text 

750 if lpage_text: 

751 text += "-" + lpage_text 

752 

753 if not text: 

754 child = node.find("page-range") 

755 if child is not None: 

756 prefix = ", pp. " 

757 suffix = "" 

758 if REF_JEP_STYLE: 

759 prefix = ", p. " 

760 

761 text += get_html_mixed_content(child, True, True, False, prefix, suffix) 

762 

763 return text 

764 

765 

766########################################################################## 

767# 

# Parse a name node ("name", "string-name", or "name-alternatives") and find the fields related to a person name:

769# first_name <given-names> 

770# last_name <surname> 

771# prefix <prefix> 

772# suffix <suffix> 

773# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>" 

774# reference_name <string_name specific-use="index"> or string_name 

775# Used in Solr for facets (regroup multiple orthographies under the same person) 

776# 

777# Note: parse_name and get_name_str can not be merged...today 

778# string-names in mixed-citation mix structured data (ex: "surname") and non structured content. 

# Ex: <string-name><surname>ROBERTSON</surname>, <given-names>D. H.</given-names></string-name>

780# Notice the ", " inside. 

# get_name_str is used for web pages and needs to preserve everything (the ", " in particular)

782# parse_name is used to export bibtex: only structured data are preserved. 

783# TODO: discuss this workflow. Why add or preserve the mix content of a string-name ? 

784# 

785# TODO: merge parse_name and parse_contrib 

786# 1. <contrib> can have multiple entries (ex: <name> then <string-name specific-use="index") for 1 single person, 

787# whereas <mixed-citation> or <element-citation> use 1 entry per person. 

# 2. string-name in a contrib is a simple text, string-name in a mixed-citation is a tree

789# 

790########################################################################## 

791 

792 

def get_name_params(first_name, last_name, prefix, suffix, string_name="", reference_name=""):
    """Complete the fields describing a person name and return them as a dict.

    * ``reference_name`` defaults to ``string_name`` when only the latter is given.
    * ``string_name`` is built as "<prefix> <last_name>, <first_name> <suffix>"
      when a last name is known and no string name was given.
    * Otherwise a ``string_name`` containing a comma is split: the text before
      the first comma becomes ``last_name`` and the text between the first and
      second commas becomes ``first_name`` (not stripped).
    * ``reference_name`` is finally derived from the first/last names when it
      is still empty, honouring the ``DISPLAY_FIRST_NAME_FIRST`` setting.

    The returned dict has the keys first_name, last_name, prefix, suffix,
    string_name and reference_name.
    """
    if not reference_name and string_name:
        reference_name = string_name

    if last_name and not string_name:
        if prefix:
            string_name = prefix + " "
        string_name += last_name
        if first_name:
            string_name += ", " + first_name
        if suffix:
            string_name += " " + suffix
    elif string_name and not last_name:
        pieces = string_name.split(",")
        if len(pieces) > 1:
            last_name, first_name = pieces[0], pieces[1]

    if last_name and not reference_name:
        if getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False):
            reference_name = (first_name + " " if first_name else "") + last_name
        else:
            reference_name = last_name + (", " + first_name if first_name else "")

    return {
        "first_name": first_name,
        "last_name": last_name,
        "prefix": prefix,
        "suffix": suffix,
        "string_name": string_name,
        "reference_name": reference_name,
    }

836 

837 

def parse_name(node):
    """Extract the person-name fields of a name node and return them as a dict.

    Supported nodes:
        * ``name`` and ``string-name``: the text of the ``given-names``,
          ``surname``, ``prefix`` and ``suffix`` children is collected.
          A ``string-name`` without first and last name falls back to its own
          leading text as ``string_name``.
        * ``name-alternatives``: the ``string-name`` children marked
          ``specific-use="index"`` provide the ``reference_name``.

    Children without text are ignored. The collected values are completed by
    ``get_name_params``, whose dict is returned.
    """
    fields = {"given-names": "", "surname": "", "prefix": "", "suffix": ""}
    string_name = reference_name = ""

    if node is not None:
        if node.tag in ("name", "string-name"):
            for child in node:
                if child.text and child.tag in fields:
                    fields[child.tag] += child.text

            if (
                node.tag == "string-name"
                and not fields["given-names"]
                and not fields["surname"]
            ):
                string_name = node.text or ""
        elif node.tag == "name-alternatives":
            for child in node:
                if child.tag == "string-name" and child.get("specific-use") == "index":
                    reference_name += child.text or ""

    return get_name_params(
        fields["given-names"],
        fields["surname"],
        fields["prefix"],
        fields["suffix"],
        string_name,
        reference_name,
    )

876 

877 

878########################################################################## 

879# 

880# Parse a Contrib node and find the fields related to a person name: 

881# first_name <given-names> 

882# last_name <surname> 

883# prefix <prefix> 

884# suffix <suffix> 

885# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>" 

886# reference_name <string_name specific-use="index"> or string_name 

887# Used in Solr for facets (regroup multiple orthographies under the same person) 

888# 

889########################################################################## 

890 

891 

def parse_contrib(node):
    """Extract the person-name fields of a contrib node and return them as a dict.

    The children of ``node`` are scanned:
        * ``name``: text of ``given-names``, ``surname``, ``prefix``, ``suffix``;
        * ``string-name``: its text is the ``string_name``;
        * ``name-alternatives``: ``string-name`` children marked
          ``specific-use="index"`` provide the ``reference_name``.

    Elements without text are ignored. The missing fields are then derived by
    ``get_name_params`` (same rules as ``parse_name``), whose dict is returned.
    """
    fields = {"given-names": "", "surname": "", "prefix": "", "suffix": ""}
    string_name = reference_name = ""

    if node is not None:
        for child in node:
            if child.tag == "name":
                for great_child in child:
                    if great_child.text is not None and great_child.tag in fields:
                        fields[great_child.tag] += great_child.text
            elif child.tag == "string-name":
                if child.text is not None:
                    string_name += child.text
            elif child.tag == "name-alternatives":
                for great_child in child:
                    if (
                        great_child.text is not None
                        and great_child.tag == "string-name"
                        and great_child.get("specific-use") == "index"
                    ):
                        reference_name += great_child.text

    # The derivation of string_name / reference_name is shared with parse_name.
    return get_name_params(
        fields["given-names"],
        fields["surname"],
        fields["prefix"],
        fields["suffix"],
        string_name,
        reference_name,
    )

960 

961 

def make_int(value):
    """Return the integer found in the part of ``value`` before the first "-".

    The part is converted directly when it is a plain integer ("12-15" -> 12);
    otherwise its decimal digits are kept and converted ("S12" -> 12).

    Raises:
        ValueError: when that part contains no digit at all.
    """
    head = value.split("-")[0]
    try:
        return int(head)
    except ValueError:
        # Join the digits back into a string: int() does not accept a list.
        digits = "".join(char for char in head if char.isdecimal())
        return int(digits)

972 

973 

def uni2ascii(s):
    """Return ``str(s)`` as ASCII bytes, with accents and non-ASCII characters removed.

    The string is decomposed (NFKD) first, so that accented letters keep their
    base letter; whatever is still not ASCII is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", str(s))
    return decomposed.encode("ascii", "ignore")

977 

978 

# Module-level id types, unset until the setters below are called.
sid_type = None
pid_type = None


def set_sid_type(id_type):
    """Store ``id_type`` in the module-level ``sid_type``."""
    global sid_type
    sid_type = id_type


def set_pid_type(id_type):
    """Store ``id_type`` in the module-level ``pid_type``."""
    global pid_type
    pid_type = id_type

991 

992 

993class XmlData: 

994 ids_xpath = None 

995 id_type_attr = "pub-id-type" 

996 

997 extids_xpath = None 

998 extid_type_attr = None 

999 title_group_elt_path = None 

1000 title_path = None 

1001 trans_title_group_elt_path = None 

1002 trans_title_path = None 

1003 alternate_title_path = None 

1004 alternate_title_group_elt_path = None 

1005 meta_root_xpath = "" 

1006 custom_meta_path = "custom-meta-group" 

1007 counts_path = "counts" 

1008 remove_links = False 

1009 

1010 def __init__(self, tree): 

1011 self.tree = tree 

1012 if self.meta_root_xpath: 

1013 self.meta_root = tree.find(self.meta_root_xpath) 

1014 else: 

1015 self.meta_root = None 

1016 

1017 def __getattr__(self, name): 

1018 mname = "get_" + name if "self" not in name else name 

1019 getter = getattr(self, mname) 

1020 obj = getter() 

1021 setattr(self, name, obj) 

1022 return obj 

1023 

def get_doi(self):
    """Return the DOI; this base implementation always returns ``None``."""
    return None

1026 

1027 def xpath(self, xpath): 

1028 return self.tree.xpath(xpath) 

1029 

1030 def xget_subtree(self, xpath): 

1031 subtree = self.tree.xpath(xpath) 

1032 if subtree: 

1033 return subtree[0] 

1034 return None 

1035 

1036 def xget_subtrees(self, xpath): 

1037 return self.tree.xpath(xpath) 

1038 

1039 def get_subtree(self, path): 

1040 return self.tree.find(path) 

1041 

1042 def get_subtrees(self, path): 

1043 return self.tree.findall(path) 

1044 

1045 def get_node_text(self, path, return_none=""): 

1046 node = self.tree.find(path) 

1047 if node is None: 

1048 return return_none 

1049 if node.text is None: 

1050 return return_none 

1051 xml_text = etree.tostring( 

1052 node, encoding="unicode", method="text", xml_declaration=False, with_tail=False 

1053 ) 

1054 return xml_text 

1055 

1056 def get_nodes_text(self, path): 

1057 return [x.text for x in self.tree.findall(path)] 

1058 

1059 def get_ascii_text(self, path, return_none=""): 

1060 return uni2ascii(self.get_node_text(path, return_none=return_none)) 

1061 

1062 def xget_node_text(self, xpath, return_none=None): 

1063 try: 

1064 return self.tree.xpath(xpath)[0].text 

1065 except BaseException: 

1066 return return_none 

1067 

1068 def xget_ascii_text(self, xpath, return_none=""): 

1069 return uni2ascii(self.xget_node_text(xpath, return_none=return_none)) 

1070 

1071 def tostring(self): 

1072 self.prune() 

1073 return etree.tostring(self.tree, encoding="utf-8", xml_declaration=False) 

1074 

1075 __str__ = tostring 

1076 

1077 def prune(self): 

1078 pass 

1079 

1080 def get_ids(self): 

1081 if self.ids_xpath is not None: 

1082 nodes = self.xget_subtrees(self.ids_xpath) 

1083 return [(x.get(self.id_type_attr), x.text) for x in nodes if x.text is not None] 

1084 return [] 

1085 

1086 def get_mathdoc_id(self): 

1087 if self.mathdoc_id_xpath is not None: 

1088 try: 

1089 node = self.xget_subtrees(self.mathdoc_id_xpath)[0] 

1090 except IndexError: 

1091 return None 

1092 else: 

1093 return node.text 

1094 return None 

1095 

1096 def get_title_xml(self): 

1097 title_xml = "" 

1098 node = self.tree.find(self.title_group_elt_path) 

1099 if node is None and self.alternate_title_group_elt_path: 

1100 node = self.tree.find(self.alternate_title_group_elt_path) 

1101 if node is not None: 

1102 title_xml = get_mixed_content(node) 

1103 return title_xml 

1104 

1105 def inner_get_title_html(self, path, alternate_path=None): 

1106 title_html = "" 

1107 node = self.tree.find(path) 

1108 if node is None and alternate_path: 

1109 node = self.tree.find(alternate_path) 

1110 if node is not None: 

1111 title_html = get_html_mixed_content(node) 

1112 return title_html 

1113 

1114 def get_title_html(self): 

1115 return self.inner_get_title_html(self.title_path, self.alternate_title_path) 

1116 

1117 def get_trans_title_html(self): 

1118 return self.inner_get_title_html(self.trans_title_path) 

1119 

1120 def inner_get_title_tex(self, path, alternate_path=None): 

1121 title_tex = "" 

1122 node = self.tree.find(path) 

1123 if node is None and alternate_path: 

1124 node = self.tree.find(alternate_path) 

1125 if node is not None: 

1126 title_tex = get_tex(node) 

1127 return title_tex 

1128 

1129 def get_title_tex(self): 

1130 return self.inner_get_title_tex(self.title_path, self.alternate_title_path) 

1131 

1132 def get_trans_title_tex(self): 

1133 return self.inner_get_title_tex(self.trans_title_path) 

1134 

1135 def get_lang(self): 

1136 tree = self.tree 

1137 

1138 lang = get_lang_attrib(tree) 

1139 

1140 if lang == "und": 

1141 parent = tree.getparent() 

1142 

1143 grand_parent = parent 

1144 while grand_parent is not None: 

1145 parent = grand_parent 

1146 grand_parent = parent.getparent() 

1147 

1148 lang = get_lang_attrib(parent) 

1149 

1150 return lang 

1151 

1152 def get_trans_lang(self): 

1153 lang = "und" 

1154 node = self.tree.find(self.trans_title_group_elt_path) 

1155 lang = get_lang_attrib(node) 

1156 

1157 return lang 

1158 

1159 def get_extids(self): 

1160 if self.extids_xpath is not None: 

1161 nodes = self.xget_subtrees(self.extids_xpath) 

1162 links = [] 

1163 for n in nodes: 

1164 id_type = n.get(self.extid_type_attr) 

1165 value = n.text.strip() 

1166 if id_type is None and value.find("doi.org/") > 0: 

1167 id_type = "doi" 

1168 if id_type in ( 

1169 "mr-item-id", 

1170 "zbl-item-id", 

1171 "sps-id", 

1172 "numdam-id", 

1173 "mathdoc-id", 

1174 "jfm-item-id", 

1175 "eudml-item-id", 

1176 "doi", 

1177 "eid", 

1178 ): 

1179 if id_type == "numdam-id": 

1180 id_type = "mathdoc-id" 

1181 if id_type == "doi": 

1182 if value.find("doi.org") > 0: 

1183 value = value.replace("http://dx.doi.org/", "") 

1184 value = value.replace("https://doi.org/", "") 

1185 value = value.replace("doi:", "") 

1186 links.append((id_type, value)) 

1187 if self.remove_links: 

1188 n.getparent().remove(n) 

1189 return links 

1190 return [] 

1191 

1192 def get_xml(self, path, return_none=""): 

1193 node = self.get_subtree(path) 

1194 if node is not None: 

1195 return etree.tostring(node, encoding="utf-8", xml_declaration=False) 

1196 return return_none 

1197 

1198 def get_inner_xml(self, path, return_none=""): 

1199 node = self.get_subtree(path) 

1200 if node is not None: 

1201 return innerxml(node) 

1202 return return_none 

1203 

1204 def xget_xml(self, path): 

1205 node = self.xget_subtree(path) 

1206 if node is not None: 

1207 return etree.tostring(node, encoding="utf-8", xml_declaration=False) 

1208 return "" 

1209 

1210 def get_catxml(self, path): 

1211 nodes = self.get_subtrees(path) 

1212 text = [] 

1213 for node in nodes: 

1214 text.append(etree.tostring(node)) 

1215 return "".join(text) 

1216 

1217 def get_streams(self): 

1218 if self.meta_root is not None: 

1219 self_uris = self.meta_root.findall("self-uri") 

1220 else: 

1221 self_uris = self.tree.findall("self-uri") 

1222 vv = [] 

1223 for node in self_uris: 

1224 href = base = type = "" 

1225 for attrib in node.attrib: 

1226 name = normalize(attrib) 

1227 

1228 href = node.attrib[attrib] if name == "href" else href 

1229 base = node.attrib[attrib] if name == "base" else base 

1230 type = node.attrib[attrib] if name == "content-type" else type 

1231 

1232 v = { 

1233 "rel": "full-text", 

1234 "mimetype": type or "text/html", 

1235 "location": href, 

1236 "base": base, 

1237 "text": node.text if node.text else "Link", 

1238 } 

1239 

1240 vv.append(v) 

1241 return vv 

1242 

1243 def get_related_objects(self): 

1244 related = [] 

1245 if self.meta_root is not None: 

1246 nodes = self.meta_root.findall("related-object") 

1247 else: 

1248 nodes = self.tree.findall("related-object") 

1249 for node in nodes: 

1250 rel = href = base = type = "" 

1251 for attrib in node.attrib: 

1252 name = normalize(attrib) 

1253 

1254 rel = node.attrib[attrib] if name == "link-type" else rel 

1255 href = node.attrib[attrib] if name == "href" else href 

1256 base = node.attrib[attrib] if name == "base" else base 

1257 type = node.attrib[attrib] if name == "content-type" else type 

1258 

1259 text = innerxml(node) 

1260 v = {"rel": rel, "mimetype": type, "location": href, "base": base, "metadata": text} 

1261 related.append(v) 

1262 return related 

1263 

1264 def get_supplementary_materials(self): 

1265 materials = [] 

1266 if self.meta_root is not None: 

1267 nodes = self.meta_root.findall("supplementary-material") 

1268 else: 

1269 nodes = self.tree.findall("supplementary-material") 

1270 for node in nodes: 

1271 try: 

1272 location = node.attrib["href"] 

1273 except KeyError: 

1274 location = node.attrib["id"] 

1275 material = { 

1276 "rel": node.attrib.get("content-type"), 

1277 "mimetype": node.attrib.get("mimetype"), 

1278 "location": location, 

1279 "base": "", 

1280 "metadata": "", 

1281 "caption": node.xpath("caption/text()")[0], 

1282 } 

1283 materials.append(material) 

1284 return materials 

1285 

1286 def get_metadataparts(self): 

1287 return [] 

1288 

1289 def get_custom_meta(self): 

1290 cm = {} 

1291 if self.custom_meta_path: 

1292 node = self.tree.find(self.custom_meta_path) 

1293 if node is not None: 

1294 for child in node: 

1295 key = child[0].text 

1296 value = child[1].text 

1297 cm[key] = value 

1298 return cm 

1299 

1300 def get_wall(self): 

1301 try: 

1302 wall = self.custom_meta["wall"] 

1303 except KeyError: 

1304 return 0 

1305 return int(wall) 

1306 

1307 def get_pid(self): 

1308 # try: 

1309 # name = self.custom_meta['provider'] 

1310 # except KeyError: 

1311 # return None 

1312 # provider_id_type = name + '-id' 

1313 for id_type, id_value in self.ids: 

1314 if id_type == pid_type or ( 

1315 (id_type == "numdam-id" or id_type == "mathdoc-id") 

1316 and (pid_type == "numdam-id" or pid_type == "mathdoc-id") 

1317 ): 

1318 return id_value 

1319 

1320 def get_provider(self): 

1321 return self.custom_meta.get("provider", None) 

1322 

1323 def get_sid(self): 

1324 for id_type, id_value in self.ids: 

1325 if id_type == sid_type: 

1326 return id_value 

1327 return None 

1328 

1329 def get_counts(self): 

1330 counts = [] 

1331 if self.counts_path: 

1332 node = self.tree.find(self.counts_path) 

1333 if node is not None: 

1334 page_count = node.find("page-count") 

1335 if page_count is None: 

1336 page_count = node.find("book-page-count") 

1337 count = page_count.get("count") 

1338 if not count: 

1339 count = get_node_text(node) 

1340 counts.append(("page-count", count)) 

1341 return counts 

1342 

1343 def get_ext_links(self): 

1344 referentials = [ 

1345 "jfm-item-id", 

1346 "zbl-item-id", 

1347 "mr-item-id", 

1348 "nmid", 

1349 "numdam-id", 

1350 "mathdoc-id", 

1351 "sps-id", 

1352 "dmlid", 

1353 "eudml-item-id", 

1354 ] 

1355 result = [] 

1356 if self.meta_root is not None: 

1357 nodes = self.meta_root.findall("ext-link") 

1358 else: 

1359 nodes = self.tree.findall("ext-link") 

1360 for node in nodes: 

1361 rel = href = base = "" 

1362 for attrib in node.attrib: 

1363 name = normalize(attrib) 

1364 

1365 rel = node.attrib[attrib] if name == "ext-link-type" else rel 

1366 href = node.attrib[attrib] if name == "href" else href 

1367 base = node.attrib[attrib] if name == "base" else base 

1368 

1369 if rel in referentials: 

1370 continue 

1371 

1372 text = innerxml(node) 

1373 v = {"rel": rel, "mimetype": "", "location": href, "base": base, "metadata": text} 

1374 result.append(v) 

1375 return result 

1376 

1377 def get_last_modified_iso_8601_date_str(self): 

1378 if self.last_modified_path: 

1379 node = self.tree.find(self.last_modified_path) 

1380 if node is not None: 

1381 last_modified_iso_8601_date_str = node.attrib["iso-8601-date"] 

1382 return last_modified_iso_8601_date_str 

1383 # on traite le cas où le container arrive via ptf-tools et donc la date de 

1384 # dernière modification est la date d'import 

1385 return timezone.now().isoformat() 

1386 

1387 def get_date_published_iso_8601_date_str(self): 

1388 date_str = None 

1389 if self.published_path: 

1390 node = self.tree.find(self.published_path) 

1391 if node is not None: 

1392 if "iso-8601-date" in node.attrib: 

1393 date_published_iso_8601_date_str = node.attrib["iso-8601-date"] 

1394 date_str = date_published_iso_8601_date_str 

1395 else: 

1396 year = month = day = "" 

1397 

1398 sub_node = node.find("year") 

1399 if sub_node is not None: 

1400 year = sub_node.text 

1401 sub_node = node.find("month") 

1402 if sub_node is not None: 

1403 month = sub_node.text 

1404 sub_node = node.find("day") 

1405 if sub_node is not None: 

1406 day = sub_node.text 

1407 

1408 date_str = year 

1409 if date_str and month: 

1410 date_str += "-" + month 

1411 if date_str and day: 

1412 date_str += "-" + day 

1413 

1414 return date_str 

1415 

1416 def get_prod_deployed_date_iso_8601_date_str(self): 

1417 if self.prod_deployed_date_path: 

1418 node = self.tree.find(self.prod_deployed_date_path) 

1419 if node is not None: 

1420 prod_deployed_date_iso_8601_date_str = node.attrib["iso-8601-date"] 

1421 return prod_deployed_date_iso_8601_date_str 

1422 return None 

1423 

1424 

class StreamGroup:
    """Streams described by the children of a group element.

    ``use`` is the lower-cased ``use`` attribute of the group ("" when the
    attribute is missing). ``streams`` holds one dict per child that has a
    ``link`` sub-element.
    """

    def __init__(self, tree):
        self.use = (tree.get("use") or "").lower()
        streams = []
        for node in tree:
            link = node.find("link")
            if link is None:
                # Nothing to describe without a <link>.
                continue
            rel = href = seq = content_type = ""
            for attrib in link.attrib:
                name = normalize(attrib)
                # All values come from the <link> being iterated. Reading
                # content-type from the parent node raised KeyError whenever
                # the parent lacked that attribute.
                if name == "rel":
                    rel = link.attrib[attrib]
                elif name == "href":
                    href = link.attrib[attrib]
                elif name == "seq":
                    seq = link.attrib[attrib]
                elif name == "content-type":
                    content_type = link.attrib[attrib]

            streams.append(
                {
                    "rel": rel,
                    "mimetype": content_type,
                    "location": href,
                    "seq": seq,
                    "text": link.text or "",
                }
            )
        self.streams = streams

1449 

1450 

1451### 

1452# 

class Work(XmlData):
    """XmlData with abstracts, contributors, keywords, subjects, awards and references.

    Subclasses provide ``abstract_path``, ``trans_abstract_path``,
    ``contrib_path``, ``kwd_path``, ``subj_path`` and ``funding_path``.
    """

    lang = "und"
    back_paths = ("back", "book-back")
    biblio_xpath = "ref-list"

    def inner_get_lang(self, node):
        """Return the language of ``node``, defaulting to ``self.lang``."""
        the_lang = get_lang_attrib(node)
        if the_lang == "und":
            the_lang = self.lang
        return the_lang

    def inner_get_abstract(self, node, tag, attrs):
        """Append the XML/HTML/TeX description of ``node`` to ``attrs``."""
        if node is None:
            return
        attrs.append(
            {
                "tag": tag,
                "lang": self.inner_get_lang(node),
                "value_xml": get_mixed_content(node),
                "value_html": get_html_mixed_content(node),
                "value_tex": get_tex(node),
            }
        )

    def get_abstracts(self):
        """Return the abstracts followed by the translated abstracts.

        Translated abstracts get their tag prefixed with "trans-".
        """
        attrs = []
        for node in self.tree.findall(self.abstract_path):
            tag = node.get("abstract-type") or "abstract"
            self.inner_get_abstract(node, tag, attrs)

        for node in self.tree.findall(self.trans_abstract_path):
            tag = "trans-" + (node.get("abstract-type") or "abstract")
            self.inner_get_abstract(node, tag, attrs)

        return attrs

    def get_contrib_groups(self):
        """Return the non-empty contributor groups.

        A contributor is kept only when it has a first, last, string or
        reference name.
        """
        groups = []
        for g in self.tree.findall(self.contrib_path):
            gc = []
            for contrib in g.findall("contrib"):
                params = parse_contrib(contrib)
                params["contrib_type"] = contrib.get("contrib-type") or ""
                params["deceased"] = contrib.get("deceased") or ""
                params["contrib_xml"] = get_mixed_content(contrib)
                if (
                    params["first_name"]
                    or params["last_name"]
                    or params["string_name"]
                    or params["reference_name"]
                ):
                    gc.append(params)
            if gc:
                groups.append({"content_type": g.get("content-type") or "", "contribs": gc})
        return groups

    def get_kwd_groups(self):
        """Return one dict per keyword group.

        A group with an ``unstructured-kwd-group`` child is described by
        that child's XML/HTML/TeX; otherwise by the inner XML of its ``kwd``
        children.
        """
        groups = []
        for g in self.tree.findall(self.kwd_path):
            ugrp = g.find("unstructured-kwd-group")
            the_lang = self.inner_get_lang(g)
            if ugrp is not None:
                groups.append(
                    {
                        "content_type": g.get("content-type") or "",
                        "lang": the_lang,
                        "value_xml": get_mixed_content(ugrp),
                        "value_html": get_html_mixed_content(ugrp),
                        "value_tex": get_tex(ugrp),
                        "kwds": [],
                    }
                )
            else:
                groups.append(
                    {
                        "content_type": g.get("kwd-group-type") or "",
                        "lang": the_lang,
                        "value": "",
                        "kwds": [innerxml(x) for x in g.findall("kwd")],
                    }
                )
        return groups

    def get_subj_groups(self):
        """Return one dict per subject group."""
        groups = []
        for g in self.tree.findall(self.subj_path):
            groups.append(
                {
                    "content_type": g.get("subj-group-type") or "",
                    "lang": self.inner_get_lang(g),
                    "value": "",
                    "subjects": [innerxml(x) for x in g.findall("subject")],
                }
            )
        return groups

    def get_awards(self):
        """Return ``{"abbrev", "award_id"}`` for each fully described award.

        An award is reported only when both the funder abbreviation and the
        award id are present.
        """
        awards = []

        for node in self.tree.findall(self.funding_path):
            abbrev = award_id = None
            for name_node in node.findall("funding-source/named-content"):
                tag = name_node.get("content-type") or ""
                # NOTE(review): "abbrevation" (sic) is kept as is; it looks
                # like the value used in the source XML. Confirm before
                # changing it.
                if tag == "abbrevation":
                    abbrev = innerxml(name_node)
            id_node = node.find("award-id")
            if id_node is not None:
                award_id = innerxml(id_node)

            # Test award_id, not the builtin id(), which is never None.
            if abbrev is not None and award_id is not None:
                awards.append({"abbrev": abbrev, "award_id": award_id})

        return awards

    def get_keywords(self):
        """Return the concatenated XML of the keyword groups."""
        return self.get_catxml(self.kwd_path)

    def get_bibitems(self):
        """Return a BibItem for each ``ref`` of the reference list.

        The first existing element of ``back_paths`` is searched for
        ``biblio_xpath``. Returns [] when either is missing.
        """
        back = None
        for back_path in self.back_paths:
            back = self.tree.find(back_path)
            if back is not None:
                break
        if back is None:
            return []
        ref_list = back.find(self.biblio_xpath)
        if ref_list is None:
            return []
        return [BibItem(ref) for ref in ref_list if ref.tag == "ref"]

1633 

1634 

class InCollection(XmlData):
    """Element located inside a collection.

    Stores the volume, sequence and volume series returned by
    ``get_volume_and_seq`` and wraps the ``collection-meta`` child in a
    Collection.
    """

    def __init__(self, tree):
        super().__init__(tree)
        volume, seq, vseries = get_volume_and_seq(tree)
        self.volume = volume
        self.seq = seq
        self.vseries = vseries
        self.collection = Collection(tree.find("collection-meta"))

1641 

1642 

class BitsCollection(XmlData):
    """Collection description whose volume data is read from the element itself.

    ``seq`` is the integer ``seq`` attribute, else the integer volume
    number, else 0. ``volume`` and ``vseries`` are the texts of
    ``volume-in-collection/volume-number`` and
    ``volume-in-collection/volume-series`` ("" when the element is missing).
    """

    def __init__(self, tree):
        # Initialise tree / meta_root like every other XmlData subclass.
        # Without it, any inherited getter touching self.tree failed.
        super().__init__(tree)

        # Errors expected from a missing element/attribute or non-numeric text.
        lookup_errors = (AttributeError, TypeError, ValueError)

        def find_text(path):
            """Return the text of the element at ``path``, "" if it is missing."""
            try:
                return tree.find(path).text
            except AttributeError:
                return ""

        try:
            seq = int(tree.get("seq"))
        except lookup_errors:
            try:
                seq = int(tree.find("volume-in-collection/volume-number").text)
            except lookup_errors:
                seq = 0

        self.volume = find_text("volume-in-collection/volume-number")
        self.seq = seq
        self.vseries = find_text("volume-in-collection/volume-series")
        self.collection = Collection(tree)

1664 

1665 

class Publisher(XmlData):
    """Wrapper around a ``publisher`` element."""

    mathdoc_id_xpath = 'publisher-id[@publisher-id-type="mathdoc-id"]'

    def get_name(self):
        """Return the text of ``publisher-name``."""
        name = self.get_node_text("publisher-name")
        return name

    def get_loc(self):
        """Return the text of ``publisher-loc``."""
        location = self.get_node_text("publisher-loc")
        return location

1674 

1675 

class EventSeries(XmlData):
    """Wrapper around an event-series element; keeps its ``event-type``."""

    def __init__(self, tree):
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of ``event-name``."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of ``event-acronym``."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_short_title(self):
        """Return the short title, always empty here."""
        return ""

1689 

1690 

class Event(XmlData):
    """Wrapper around an ``event`` element; keeps its ``event-type``."""

    def __init__(self, tree):
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of ``event-name``."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of ``event-acronym``."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_year(self):
        """Return the text of ``event-date``."""
        date_text = self.get_node_text("event-date")
        return date_text

    def get_number(self):
        """Return the text of ``event-num``."""
        number = self.get_node_text("event-num")
        return number

    def get_loc(self):
        """Return the text of ``event-loc``."""
        location = self.get_node_text("event-loc")
        return location

1710 

1711 

# <collection-meta> of a <book>

1713 

1714 

class Collection(Work):
    """Wrapper around the ``collection-meta`` element of a book."""

    lang = "und"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    ids_xpath = "collection-id"
    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    trans_title_group_elt_path = "title-group/trans-title-group"
    trans_title_path = "title-group/trans-title-group/trans-title"
    # Placeholder tag that matches nothing, spelled like the other
    # "Not-supported" placeholders so it stays a plain tag name.
    funding_path = "Not-supported"

    contrib_path = "contrib-group"
    id_type_attr = "collection-id-type"

    def get_coltype(self):
        """Return the ``collection-type`` attribute, "collection" by default."""
        return self.tree.get("collection-type") or "collection"

    def get_publisher(self):
        """Return the Publisher of the ``publisher`` child, or None."""
        node = self.tree.find("publisher")
        if node is not None:
            return Publisher(node)
        return None

    def get_title(self):
        """Return the text of ``title-group/title``."""
        return self.get_node_text("title-group/title")

    def get_abbrev(self):
        """Return the text of ``title-group/abbrev-title``."""
        return self.get_node_text("title-group/abbrev-title")

    def get_ids(self):
        """Return the collection ids plus the print and electronic ISSNs.

        ``pub-type="ppub"`` gives an "issn" id and ``"epub"`` an "e-issn" id.
        ISSN elements without text are ignored, as in ``Journal.get_ids``.
        """
        ids = XmlData.get_ids(self)
        issn_types = {"ppub": "issn", "epub": "e-issn"}
        for issn in self.tree.findall("issn"):
            id_type = issn_types.get(issn.get("pub-type"))
            if id_type is not None and issn.text:
                ids.append((id_type, issn.text))
        return ids

1760 

1761 

# <journal-meta> of a <journal-issue>

1763 

1764 

class Journal(Work):
    """Wrapper around the ``journal-meta`` element of a journal issue."""

    ids_xpath = "journal-id"
    id_type_attr = "journal-id-type"
    title_group_elt_path = "journal-title-group"
    title_path = "journal-title-group/journal-title"
    abbrev_title_path = "journal-title-group/abbrev-title"
    trans_title_group_elt_path = "journal-title-group/trans-title-group"
    trans_title_path = "journal-title-group/trans-title-group/trans-title"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    contrib_path = "contrib-group"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    funding_path = "Not-supported"

    def get_ids(self):
        """Return the journal ids plus the non-empty print/electronic ISSNs.

        ``pub-type="ppub"`` gives an "issn" id and ``"epub"`` an "e-issn" id.
        """
        ids = XmlData.get_ids(self)
        for issn in self.tree.findall("issn"):
            pub_type = issn.get("pub-type")
            if not issn.text:
                continue
            if pub_type == "ppub":
                ids.append(("issn", issn.text))
            elif pub_type == "epub":
                ids.append(("e-issn", issn.text))
        return ids

    def get_publisher(self):
        """Return the Publisher of the ``publisher`` child, or None."""
        node = self.tree.find("publisher")
        return None if node is None else Publisher(node)

    def get_title_group(self):
        """Return the inner XML of the title group, "" when it is missing."""
        node = self.tree.find(self.title_group_elt_path)
        if node is None:
            return ""
        return innerxml(node)

    def get_title_xml(self):
        """Return the mixed XML content of the title group, "" when missing."""
        node = self.tree.find(self.title_group_elt_path)
        if node is None:
            return ""
        return get_mixed_content(node)

    def get_title_html(self):
        """Return the HTML of the title, "" when missing."""
        node = self.tree.find(self.title_path)
        if node is None:
            return ""
        return get_html_mixed_content(node)

    def get_title_tex(self):
        """Return the TeX of the title, "" when missing."""
        node = self.tree.find(self.title_path)
        if node is None:
            return ""
        return get_tex(node)

    def get_abbrev(self):
        """Return the text of the abbreviated title."""
        return self.get_node_text(self.abbrev_title_path)

    def get_coltype(self):
        """Return the "serial-type" custom metadata (None when absent)."""
        return self.custom_meta.get("serial-type")

1832 

1833 

class Publication(Journal):
    """Journal variant reading ``publication-id`` ids and ``title-group`` titles."""

    # Identifiers
    ids_xpath = "publication-id"
    id_type_attr = "publication-id-type"

    # Titles
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    abbrev_title_path = "title-group/abbrev-title"

    # Translated titles
    trans_title_group_elt_path = "title-group/trans-title-group"
    trans_title_path = "title-group/trans-title-group/trans-title"

1842 

1843 

class Issue(Work):
    """Wrapper around a journal issue; iterating it yields its articles."""

    mathdoc_id_xpath = 'issue-meta/issue-id[@issue-id-type="mathdoc-id"]'
    ids_xpath = "issue-meta/issue-id"
    abstract_path = "issue-meta/abstract"
    trans_abstract_path = "issue-meta/trans-abstract"
    kwd_path = "issue-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "issue-meta/contrib-group"
    title_group_elt_path = "issue-meta/issue-title"
    title_path = "issue-meta/issue-title"
    # TODO support langs in issue-title
    subtitle_path = ""
    trans_title_path = ""
    trans_title_group_elt_path = ""
    counts_path = "issue-meta/counts"
    last_modified_path = 'issue-meta/history/date[@date-type="last-modified"]'
    published_path = 'issue-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'issue-meta/history/date[@date-type="prod-deployed-date"]'
    funding_path = "Not-supported"

    lang = "und"
    meta_root_xpath = "issue-meta"
    custom_meta_path = "issue-meta/custom-meta-group"

    def get_journal(self):
        """Return the Journal of the ``journal-meta`` child, or None if absent.

        Wrapping a missing element produced a Journal on which every lookup
        failed, and made the None check in ``get_publisher`` ineffective.
        """
        node = self.tree.find("journal-meta")
        if node is None:
            return None
        return Journal(node)

    def get_ctype(self):
        return "issue"

    def get_vseries(self):
        """Return the text of ``issue-meta/volume-series``."""
        return self.get_node_text("issue-meta/volume-series")

    def get_vseries_int(self):
        """Return the volume series through ``make_int``, 0 when empty."""
        v = self.get_node_text("issue-meta/volume-series")
        if v:
            return make_int(v)
        return 0

    def get_volume(self):
        """Return the text of ``issue-meta/volume``."""
        return self.get_node_text("issue-meta/volume")

    def get_volume_int(self):
        """Return the volume through ``make_int``, 0 when empty."""
        v = self.get_node_text("issue-meta/volume")
        if v:
            return make_int(v)
        return 0

    def get_number(self):
        """Return the text of ``issue-meta/issue``."""
        return self.get_node_text("issue-meta/issue")

    def get_number_int(self):
        """Return the issue number through ``make_int``, 0 when empty."""
        v = self.get_node_text("issue-meta/issue")
        if v:
            return make_int(v)
        return 0

    def get_year(self):
        """Return the text of ``issue-meta/pub-date/year``."""
        return self.get_node_text("issue-meta/pub-date/year")

    def get_event(self):
        """Return the Event of the ``event`` child, or None."""
        node = self.tree.find("event")
        if node is not None:
            return Event(node)
        return None

    def get_publisher(self):
        """Return the publisher of the issue's journal, or None without journal."""
        xpublisher = None
        xjournal = self.get_journal()
        if xjournal is not None:
            xpublisher = xjournal.publisher
        return xpublisher

    def __iter__(self):
        """Yield an Article for each child of ``body``; nothing without a body."""
        body = self.tree.find("body")
        if body is None:
            return
        for node in body:
            yield Article(node)

1922 

1923 

class BibItem(XmlData):
    """Wrapper around a ``ref`` element of a reference list."""

    extids_xpath = "*/ext-link"
    extid_type_attr = "ext-link-type"

    # remove_links = True
    def __init__(self, tree):
        """Wrap ``tree`` and collect its external ids in ``self.extids``.

        Besides the ``ext-link`` ids, ``pub-id`` elements contribute a DOI
        (only when no ext-link already gave one) and the eid / arxiv / tel /
        hal / theses.fr ids. ``pub-id`` elements without text are ignored.
        """
        super().__init__(tree)
        self.extids = self.get_extids()

        # Temporary code
        # Some xml only have a pub-id (doi) and do not have an ext-link with a ext-link-type=doi
        # We need to manually create the link
        has_doi = any(id_type == "doi" for id_type, _id_value in self.extids)

        for node in self.tree.findall("*/pub-id"):
            value = node.text
            if value is None:
                # An empty <pub-id/> carries no identifier and would crash
                # the prefix stripping below.
                continue
            id_type = node.get("pub-id-type")
            if id_type == "doi" and not has_doi:
                value = value.replace("http://dx.doi.org/", "")
                value = value.replace("https://doi.org/", "")
                value = value.replace("doi:", "")
                self.extids.append(("doi", value))
            elif id_type in ["eid", "arxiv", "tel", "hal", "theses.fr"]:
                self.extids.append((id_type, value))

    def get_ref(self):
        """Return the serialized reference (see ``tostring``)."""
        return self.tostring()

    def split_label(self):
        """
        Used when sorting non-digit bibitems

        Splits the lower-cased label around its digits into
        ``label_prefix`` and ``label_suffix``.
        """
        label = self.label.lower()

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

    def get_label(self):
        """Return the text of the ``label`` child, "" when missing or empty."""
        node = self.tree.find("label")
        if node is not None:
            # An empty <label/> has text None; callers expect a string.
            return node.text or ""
        return ""

    def get_user_id(self):
        """Return the ``id`` attribute of the reference, "" when absent."""
        return self.tree.get("id", "")

    def get_citation_xml(self):
        """Return the citation XML, prefixed with ``<label>`` when labelled.

        ``mixed-citation`` is tried first, then ``element-citation``.
        """
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_mixed_content(node)

        label = self.get_label()
        if label:
            text = "<label>" + label + "</label>" + text

        return text

    def get_citation_html(self):
        """Return the citation HTML, prefixed with the bracketed label."""
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_html_mixed_content(node)

        label = self.get_label()
        if label:
            if label[0] != "[":
                label = "[" + label + "]"
            text = label + " " + text

        return text

    def get_citation_tex(self):
        """Return the citation TeX, prefixed with the bracketed label."""
        text = ""
        for name in ("mixed-citation", "element-citation"):
            if not text:
                node = self.tree.find(name)
                text = get_tex(node)

        label = self.get_label()
        if label:
            if label[0] != "[":
                label = "[" + label + "]"
            text = label + " " + text

        return text

    def get_citation_node(self):
        """Return ``element-citation``, else ``mixed-citation``, else None."""
        tree = self.tree.find("element-citation")
        if tree is None:
            tree = self.tree.find("mixed-citation")

        return tree

    def get_type(self):
        """Return the citation's ``publication-type``, "misc" by default."""
        type = "misc"

        tree = self.get_citation_node()
        if tree is not None:
            type = tree.get("publication-type", "misc")

        return type

    def get_node_text(self, node_name, tex=False):
        """Return the text (or TeX when ``tex``) of a child of the citation.

        Returns "" when the citation or the child is missing.
        """
        text = ""
        tree = self.get_citation_node()
        if tree is not None:
            node = tree.find(node_name)
            if node is not None:
                if tex:
                    text = get_tex(node)
                else:
                    text = node.text
        return text

    def get_publisher_name(self):
        return self.get_node_text("publisher-name")

    def get_publisher_loc(self):
        return self.get_node_text("publisher-loc")

    def get_institution(self):
        return self.get_node_text("institution")

    def get_series(self):
        return self.get_node_text("series")

    def get_volume(self):
        return self.get_node_text("volume")

    def get_issue(self):
        return self.get_node_text("issue")

    def get_year(self):
        return self.get_node_text("year")

    # TODO: comments may have ext-link like arxiv. Add ExtId ?
    def get_comment(self):
        return self.get_node_text("comment", tex=True)

    def get_fpage(self):
        return self.get_node_text("fpage")

    def get_lpage(self):
        return self.get_node_text("lpage")

    def get_page_range(self):
        return self.get_node_text("page-range")

    def get_size(self):
        """Return the ``page-count`` text, falling back to ``size``."""
        text = self.get_node_text("page-count")
        if not text:
            text = self.get_node_text("size")
        return text

    def get_source_tex(self):
        return self.get_node_text("source", tex=True)

    def get_article_title_tex(self):
        return self.get_node_text("article-title", tex=True)

    def get_chapter_title_tex(self):
        return self.get_node_text("chapter-title", tex=True)

    def get_contrib_groups(self):
        """Return a single group holding the names found in the citation.

        Only ``name``, ``string-name`` and ``name-alternatives`` children are
        parsed. Returns [] when there is no citation node.
        """
        groups = []

        tree = self.get_citation_node()
        if tree is not None:
            gc = []

            for child in tree:
                if child.tag in ("name", "string-name", "name-alternatives"):
                    params = parse_name(child)
                    params["contrib_type"] = ""
                    params["contrib_xml"] = get_mixed_content(child)
                    gc.append(params)

            groups.append({"content_type": "", "contribs": gc})
        return groups

2119 

2120 

class Relation(XmlData):
    """Accessors for a relation node (built from ``related-article`` elements in this file)."""

    def get_id_type(self):
        """Return the ``ext-link-type`` attribute, or "" when missing or empty."""
        id_type = self.tree.get("ext-link-type")
        return id_type if id_type else ""

    def get_rel_type(self):
        """Return the ``related-article-type`` attribute, or "" when missing or empty."""
        rel_type = self.tree.get("related-article-type")
        return rel_type if rel_type else ""

    def get_id_value(self):
        """Return the node's text, or "" when it has none."""
        value = self.tree.text
        return value if value else ""

    def get_right_pid(self):
        """Return ``self.id_value`` as the right-hand pid of the relation."""
        right_pid = self.id_value
        return right_pid

2133 

2134 

class Article(Work):
    """Accessors for a JATS ``<article>`` tree.

    The class attributes are the element paths (relative to ``<article>``)
    used to locate the metadata.
    """

    mathdoc_id_xpath = 'article-id[@pub-id-type="mathdoc-id"]'
    ids_xpath = "front/article-meta/article-id"
    article_xpath = "front/article-meta"

    extids_xpath = (
        'front/article-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|front/article-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|front/article-meta/ext-link[@ext-link-type="sps-id"]'
        '|front/article-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"
    title_group_elt_path = "front/article-meta/title-group"
    title_path = "front/article-meta/title-group/article-title"
    subtitle_path = "front/article-meta/title-group/subtitle"
    trans_title_group_elt_path = "front/article-meta/title-group/trans-title-group"
    trans_title_path = "front/article-meta/title-group/trans-title-group/trans-title"
    abstract_path = "front/article-meta/abstract"
    trans_abstract_path = "front/article-meta/trans-abstract"
    kwd_path = "front/article-meta/kwd-group"
    subj_path = "front/article-meta/article-categories/subj-group"
    contrib_path = "front/article-meta/contrib-group"
    meta_root_xpath = "front/article-meta"
    custom_meta_path = "front/article-meta/custom-meta-group"
    counts_path = "front/article-meta/counts"
    published_path = 'front/article-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'front/article-meta/history/date[@date-type="prod-deployed-date"]'
    history_path = "front/article-meta/history/date"
    funding_path = "front/article-meta/funding-group/award-group"

    def __init__(self, tree):
        """Wrap ``tree``, unwrapping the OAI envelope when needed.

        When the root tag is not ``article`` (import from OAI), namespaces are
        removed and the ``metadata/article`` element becomes the tree.
        """
        # Case when we import the JATS article from OAI.
        # The <article> tag is surrounded by a <header> tag. Remove this tag.
        if tree.tag != "article":
            remove_namespace(tree)
            tree = tree.xpath("metadata/article")[0]

        super().__init__(tree)
        self.article_meta = self.get_subtree(self.article_xpath)
        self.atype = tree.get("article-type") or ""
        self.numbering = ""
        self.lang = self.get_lang()

    def get_doi(self):
        """Return the text of the first DOI ``article-id``, or None when there is none."""
        nodes = self.tree.xpath('front/article-meta/article-id[@pub-id-type="doi"]')
        return nodes[0].text if nodes else None

    # When the JATS XML has only an <article>, we need to construct the Journal on the fly
    def get_journal(self):
        """Build a ``Journal`` from the first ``front/journal-meta`` element.

        Raises IndexError when the article has no ``journal-meta``.
        """
        node = self.tree.xpath("front/journal-meta")[0]
        return Journal(node)

    def get_issue_id(self):
        """Return the text of the first ``issue-id`` element, or "" when there is none."""
        nodes = self.tree.xpath("front/article-meta/issue-id")
        return nodes[0].text if nodes else ""

    def get_volume(self):
        """Return the text of the first ``volume`` element, or "" when there is none."""
        nodes = self.tree.xpath("front/article-meta/volume")
        return nodes[0].text if nodes else ""

    def get_fpage(self):
        """Return the node text of ``front/article-meta/fpage``."""
        return self.get_node_text("front/article-meta/fpage")

    def get_lpage(self):
        """Return the node text of ``front/article-meta/lpage``."""
        return self.get_node_text("front/article-meta/lpage")

    def get_page_type(self):
        """Return the ``content-type`` attribute of ``fpage``, or "" when unavailable."""
        node = self.tree.find("front/article-meta/fpage")
        if node is None:
            return ""
        return node.get("content-type") or ""

    # Olivier 2016-01-13 add page-range & elocation
    def get_page_range(self):
        """Return the node text of ``front/article-meta/page-range``."""
        return self.get_node_text("front/article-meta/page-range")

    def get_elocation(self):
        """Return the node text of ``front/article-meta/elocation-id``."""
        return self.get_node_text("front/article-meta/elocation-id")

    def get_body(self):
        """Return the plain text of ``<body>`` ("" when the article has no body)."""
        node = self.tree.find("body")
        return get_node_text(node)

    def body_jats_to_html(self, base_url):
        """Convert ``<body>`` to HTML.

        Returns a ``(body_html, figures)`` tuple; ``("", [])`` when the
        article has no ``<body>``.
        """
        body_html = ""
        figures = []
        node = self.tree.find("body")
        if node is not None:
            body_html, figures = get_html_mixed_content_with_figures(
                node,
                is_top=True,
                is_citation=False,
                is_comment=False,
                is_figure=False,
                prefix="",
                suffix="",
                sec_level=2,
                label_title="",
                figures=figures,
                base_url=base_url,
            )
        return body_html, figures

    def get_body_tex(self):
        """Return ``get_tex`` applied to the ``<body>`` node (the node may be None)."""
        node = self.tree.find("body")
        # TODO: body_tex should actually be the HTML with the TeX formulas as text
        return get_tex(node)

    def get_body_xml(self):
        """Return ``get_mixed_content`` applied to the ``<body>`` node (the node may be None)."""
        node = self.tree.find("body")
        return get_mixed_content(node)

    def get_seq(self):
        """Return the article sequence number as an int.

        The ``seq`` attribute is read from ``issue`` first, then from
        ``fpage``; 0 is returned when neither gives a usable integer.
        """
        seq = 0
        issue = self.get_subtree("front/article-meta/issue")
        if issue is not None:
            seq = issue.get("seq") or 0
        if not seq:
            fpage = self.get_subtree("front/article-meta/fpage")
            if fpage is not None:
                seq = fpage.get("seq") or 0
        try:
            return int(seq)
        except (TypeError, ValueError):
            # Non-numeric seq attribute: fall back to 0 as before.
            return 0

    def get_relations(self):
        """Return a ``Relation`` for each ``related-article``, with ``left_pid`` set to ``self.pid``."""
        relations = []
        for node in self.tree.findall("front/article-meta/related-article"):
            rel = Relation(node)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

    def get_history_dates(self):
        """Return ``{"type", "date"}`` dicts for every ``history/date`` element.

        Raises KeyError when an element lacks ``date-type`` or ``iso-8601-date``.
        """
        dates = []
        for node in self.tree.findall(self.history_path):
            date_type = node.attrib["date-type"]
            iso_date = node.attrib["iso-8601-date"]
            dates.append({"type": date_type, "date": iso_date})

        return dates

    def get_article_number(self):
        """Return the ``article-number`` entry of ``self.custom_meta``, or ""."""
        return self.custom_meta.get("article-number", "")

    def get_talk_number(self):
        """Return the ``talk-number`` entry of ``self.custom_meta``, or ""."""
        return self.custom_meta.get("talk-number", "")

2302 

2303 

class BookSeries(XmlData):
    """Accessors for a book series (collection) node."""

    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    ids_xpath = "collection-id"
    extid_type_attr = "collection-id-type"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    lang = "und"

    def get_ids(self):
        """Return ``(id_type, id_value)`` tuples: the ISSN first (if any), then every ``collection-id``."""
        issn = self.get_node_text("issn")
        ids = [("issn", issn)] if issn else []
        ids.extend(
            (node.get("collection-id-type"), node.text)
            for node in self.tree.findall("collection-id")
        )
        return ids

    def get_title(self):
        """Return the node text found at ``title_path``."""
        title = self.get_node_text(self.title_path)
        return title

    def get_abbrev(self):
        """Return the node text of ``title-group/abbrev-title``."""
        abbrev = self.get_node_text("title-group/abbrev-title")
        return abbrev

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or None when absent."""
        node = self.tree.find("publisher")
        return None if node is None else Publisher(node)

    def get_stype(self):
        """Return the ``serial-type`` entry of ``self.custom_meta`` (None when missing)."""
        stype = self.custom_meta.get("serial-type")
        return stype

2339 

2340 

2341# Mixin 

class HasParts:
    """Mixin that extracts nested ``book-part`` elements from a body container."""

    def get_parts(self):
        """Return the wrapped ``book-part`` children and detach the body from the tree.

        ``book-part`` elements are searched under ``book-body`` first, then
        under ``body``. Each one is wrapped with the class returned by
        ``self.__class__.get_book_part_class()``; that lookup only happens
        when at least one part exists. The first existing ``book-body`` /
        ``body`` element is then removed from ``self.tree``.
        """
        parts = []
        for name in ("book-body", "body"):
            parts = self.xget_subtrees(f"{name}/book-part")
            if parts:
                break

        xparts = []
        if parts:
            # Resolve the class per part, as before: nothing is looked up when
            # there is no part to wrap.
            xparts = [self.__class__.get_book_part_class()(tree) for tree in parts]

        body = None
        for name in ("book-body", "body"):
            body = self.get_subtree(name)
            if body is not None:
                break

        if body is not None:
            try:
                self.tree.getroot().remove(body)  # XSLT result tree
            except AttributeError:
                # Plain elements have no getroot(): remove from the element itself.
                self.tree.remove(body)  # Element tree
        return xparts

2363 

2364 

class BookPart(Work, HasParts):
    """Accessors for a ``<book-part>`` tree, which may itself contain nested parts."""

    id_type_attr = "book-part-id-type"
    part_xpath = "book-part-meta"
    ids_xpath = "book-part-meta/book-part-id"
    mathdoc_id_xpath = 'book-part-meta/book-part-id[@book-part-id-type="mathdoc-id"]'
    meta_xpath = "book-part-meta"
    extids_xpath = (
        'book-part-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"
    title_group_elt_path = "book-part-meta/title-group"
    title_path = "book-part-meta/title-group/title"
    subtitle_path = "book-part-meta/title-group/subtitle"
    trans_title_group_elt_path = "book-part-meta/title-group/trans-title-group"
    trans_title_path = "book-part-meta/title-group/trans-title-group/trans-title"
    abstract_path = "book-part-meta/abstract"
    trans_abstract_path = "book-part-meta/trans-abstract"
    kwd_path = "book-part-meta/kwd-group"
    # NOTE(review): the only path here prefixed with "front/" - confirm it is intended.
    subj_path = "front/book-part-meta/article-categories/subj-group"
    contrib_path = "book-part-meta/contrib-group"
    meta_root_xpath = "book-part-meta"
    custom_meta_path = "book-part-meta/custom-meta-group"
    funding_path = "book-part-meta/funding-group/award-group"

    def __init__(self, tree):
        """Wrap ``tree`` and read its ``indexed``, type and numbering attributes.

        ``indexed`` defaults to True and is False for any attribute value
        other than the string "true".
        """
        super().__init__(tree)
        self.part_meta = self.get_subtree(self.part_xpath)
        self.indexed = tree.get("indexed", "true") == "true"
        self.atype = tree.get("book-part-type") or ""
        self.numbering = tree.get("book-part-number") or ""
        self.parts = self.get_parts()
        self.lang = self.get_lang()

    def get_fpage(self):
        """Return the node text of ``book-part-meta/fpage``."""
        return self.get_node_text("book-part-meta/fpage")

    def get_lpage(self):
        """Return the node text of ``book-part-meta/lpage``."""
        return self.get_node_text("book-part-meta/lpage")

    def get_page_range(self):
        """Return "": this class does not read a page range."""
        return ""

    def get_page_type(self):
        """Return the ``content-type`` attribute of ``fpage``, or "" when unavailable."""
        node = self.tree.find("book-part-meta/fpage")
        if node is None:
            return ""
        return node.get("content-type") or ""

    def get_seq(self):
        """Return ``self.fpage`` converted to int, or 0 when it is not an integer."""
        try:
            return int(self.fpage)
        except (TypeError, ValueError):
            return 0

    def get_body(self):
        """Return the serialized ``<body>`` as UTF-8 bytes, or "" when there is no body."""
        node = self.tree.find("body")
        if node is not None:
            return etree.tostring(node, encoding="utf-8", xml_declaration=False)
        return ""

    def get_relations(self):
        """Return a ``Relation`` for each ``related-article``, with ``left_pid`` set to ``self.pid``."""
        relations = []
        for node in self.tree.findall("book-part-meta/related-article"):
            rel = Relation(node)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

    def get_article_number(self):
        """Return the ``article-number`` entry of ``self.custom_meta``, or ""."""
        return self.custom_meta.get("article-number", "")

    def get_talk_number(self):
        """Return the ``talk-number`` entry of ``self.custom_meta``, or ""."""
        return self.custom_meta.get("talk-number", "")

2449 

2450 

def get_volume_and_seq(incol):
    """Return ``(volume, seq, vseries)`` for an in-collection node.

    ``seq`` is the integer ``seq`` attribute when it is valid. Otherwise it is
    built from the digits of the first "-"-separated segment of the
    ``volume`` text (0 when there is no usable digit). When ``volume-series``
    holds an integer, ``seq`` becomes ``vseries * 10000 + seq``.

    ``volume`` and ``vseries`` are the text of the matching child elements,
    or "" when the element is missing.
    """
    volume_node = incol.find("volume")
    try:
        seq = int(incol.get("seq"))
    except (TypeError, ValueError):
        seq = 0
        if volume_node is not None and volume_node.text:
            first_segment = volume_node.text.split("-")[0]
            # Join the digits back into a string: int() cannot convert a list.
            digits = "".join(ch for ch in first_segment if ch.isdigit())
            try:
                seq = int(digits)
            except ValueError:
                # No digit at all, or digit-like characters int() rejects.
                seq = 0

    volume = volume_node.text if volume_node is not None else ""

    vseries_node = incol.find("volume-series")
    vseries = vseries_node.text if vseries_node is not None else ""
    if vseries:
        try:
            # Assumes no more than 10000 volumes per series.
            seq = int(vseries) * 10000 + seq
        except (TypeError, ValueError):
            pass
    return (volume, seq, vseries)

2480 

2481 

class Book(Work, HasParts):
    """Accessors for a ``<book>`` tree and its book parts."""

    id_type_attr = "book-id-type"
    mathdoc_id_xpath = 'book-meta/book-id[@book-id-type="mathdoc-id"]'
    ids_xpath = "book-meta/book-id"
    book_xpath = "book-meta"
    extids_xpath = (
        'book-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"

    title_group_elt_path = "book-meta/book-title-group"
    title_path = "book-meta/book-title-group/book-title"
    alternate_title_group_elt_path = "collection-meta/volume-in-collection/volume-title"
    alternate_title_path = "collection-meta/volume-in-collection/volume-title"
    trans_title_group_elt_path = "book-meta/book-title-group/trans-title-group"
    trans_title_path = "book-meta/book-title-group/trans-title-group/trans-title"
    subtitle_path = "book-meta/book-title-group/subtitle"

    abstract_path = "book-meta/abstract"
    trans_abstract_path = "book-meta/trans-abstract"
    kwd_path = "book-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "book-meta/contrib-group"
    meta_root_xpath = "book-meta"
    custom_meta_path = "book-meta/custom-meta-group"
    counts_path = "book-meta/counts"
    last_modified_path = 'book-meta/pub-history/date[@date-type="last-modified"]'
    published_path = 'book-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'book-meta/pub-history/date[@date-type="prod-deployed-date"]'
    year_path = "book-meta/pub-date/year"
    funding_path = "Not-supported"

    mbook_seq = 0
    mbook_volume = ""
    mbook_vseries = ""

    def __init__(self, tree):
        """Wrap ``tree``, load its parts and make sure the book has contributors.

        When the root tag is not ``book`` (import from OAI), namespaces are
        removed and ``metadata/book`` becomes the tree; if the first child of
        that element is ``front``, the ``front`` element is used instead.
        """
        # Case when we import the book from OAI.
        # The <book> tag is surrounded by a <header> tag. Remove this tag.
        if tree.tag != "book":
            remove_namespace(tree)
            tree = tree.xpath("metadata/book")[0]
            # Index the element directly (getchildren() is deprecated) and
            # tolerate an element without children.
            if len(tree) and tree[0].tag == "front":
                tree = tree.xpath("front")[0]

        super().__init__(tree)
        self.book_meta = self.get_subtree(self.book_xpath)
        self.contrib_groups = []
        try:
            self.book_type = tree.get("book-type") or "Book"
        except AttributeError:
            # Tree object without .get(): read the attribute from its root.
            self.book_type = tree.getroot().get("book-type") or "Book"
        # if self.book_type == 'proceedings' or self.book_type == 'edited-book'
        # or self.book_type == 'monograph' :
        if self.book_type:
            self.parts = self.get_parts()

            # patch for book without contrib-group:
            # 1 : monograph with book_parts : contrib-group of book equal to the
            #     contrib-group of the first book-part
            # OR 2 : edited-books with same author for all of its book_parts : book-type becomes
            #     'monograph' and contrib-group of book equal to the contrib-group of the first book-part
            # OR 3 : edited-books but not same author for all book-parts : contrib-group of
            #     book becomes "Collectif"
            self.contrib_groups = self.get_contrib_groups()
            if not self.contrib_groups:
                if self.book_type == "monograph" and self.parts:
                    self.contrib_groups = self.parts[0].get_contrib_groups()
                elif self.book_type == "edited-book" and self.parts:
                    # check if authors of the book-parts are identical
                    book_part_contrib_group = self.parts[0].get_contrib_groups()
                    same_authors = all(
                        part.get_contrib_groups() == book_part_contrib_group
                        for part in self.parts
                    )
                    if same_authors:
                        # Case 2 above: the book type becomes 'monograph'.
                        # (This used to be a no-op comparison.)
                        self.book_type = "monograph"
                        self.contrib_groups = book_part_contrib_group
                    else:
                        self.contrib_groups = [
                            {
                                "contribs": [
                                    {
                                        "first_name": "",
                                        "last_name": "Collectif",
                                        "suffix": "",
                                        "string_name": "Collectif",
                                        "reference_name": "Collectif",
                                        "contrib_xml": "<contrib><name><surname>Collectif</surname><given-names>"
                                        + "</given-names></name><name-alternatives>"
                                        + '<string-name specific-use="index">Collectif</string-name></name-alternatives></contrib>',
                                        "prefix": "",
                                        "contrib_type": "author",
                                    }
                                ],
                                "content_type": "authors",
                            }
                        ]

            self.body = ""
        # else: #or self.book_type == 'monograph': for a monograph there is no book-part, body contains the full text
        #     self.parts = []
        self.incollection = self.get_incollection()

        self.lang = self.get_lang()

    @staticmethod
    def get_book_part_class():
        """Return the class used to wrap the book's ``book-part`` elements."""
        return BookPart

    def get_doi(self):
        """Return the text of the first DOI ``book-id``, or None when there is none."""
        nodes = self.tree.xpath('book-meta/book-id[@book-id-type="doi"]')
        return nodes[0].text if nodes else None

    def get_ctype(self):
        """Return ``"book-"`` followed by the book type."""
        return f"book-{self.book_type}"

    def get_contrib_groups(self):
        """Return the already computed contributor groups, else the parent class result."""
        if self.contrib_groups:
            return self.contrib_groups
        return super().get_contrib_groups()

    def get_publisher(self):
        """Return a ``Publisher`` for ``book-meta/publisher``, or None when absent."""
        node = self.tree.find("book-meta/publisher")
        if node is not None:
            return Publisher(node)
        return None

    def get_year(self):
        """Return the node text found at ``year_path``."""
        return self.get_node_text(self.year_path)

    def get_title(self):
        """Return the book title, falling back to the volume title in the collection."""
        text = self.get_node_text("book-meta/title-group/title")
        if not text:
            # The fallback result used to be discarded; keep it.
            text = self.get_node_text("collection-meta/volume-in-collection/volume-title")
        return text

    def get_body(self):
        """Return the serialized ``<book-body>`` as UTF-8 bytes, or "" when absent."""
        node = self.tree.find("book-body")
        if node is not None:
            return etree.tostring(node, encoding="utf-8", xml_declaration=False)
        return ""

    def get_incollection(self):
        """Return the collections the book belongs to.

        ``in-collection`` children are wrapped as ``InCollection``; when there
        are none, ``collection-meta`` children are wrapped as ``BitsCollection``.
        """
        incols = [InCollection(node) for node in self.tree.findall("in-collection")]
        if incols:
            return incols
        return [BitsCollection(node) for node in self.tree.findall("collection-meta")]

    def get_event(self):
        """Return an ``Event`` for ``book-meta/event``, or None when absent."""
        node = self.tree.find("book-meta/event")
        if node is not None:
            return Event(node)
        return None

    def get_event_series(self):
        """Return an ``EventSeries`` for ``book-meta/event-series``, or None when absent."""
        node = self.tree.find("book-meta/event-series")
        if node is not None:
            return EventSeries(node)
        return None

    def get_vseries(self):
        """Return the node text of ``book-meta/volume-series``."""
        return self.get_node_text("book-meta/volume-series")

    def get_frontmatter(self):
        """Return the inner XML of ``front-matter``, or "" when absent."""
        node = self.tree.find("front-matter")
        if node is not None:
            return innerxml(node)
        return ""

    def get_relations(self):
        """Return a ``Relation`` for each ``related-article``, with ``left_pid`` set to ``self.pid``."""
        relations = []
        for node in self.tree.findall("book-meta/related-article"):
            rel = Relation(node)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

2675 

2676 

# Maps the class name accepted by xobj_fromtree() to the class it instantiates.
factories = {
    "collection": Collection,
    "publisher": Publisher,
    "journal": Journal,
    "issue": Issue,
    "article": Article,
    "book": Book,
}

2685 

2686 

def xobj_fromtree(classname, tree):
    """Instantiate the class registered under ``classname`` in ``factories`` with ``tree``.

    Raises KeyError when ``classname`` is not registered.
    """
    return factories[classname](tree)

2690 

2691 

def xobj_fromstring(classname, metadata):
    """Parse ``metadata`` with ``etree.fromstring`` and build the object for ``classname``."""
    return xobj_fromtree(classname, etree.fromstring(metadata))

2695 

2696 

def xobj_fromfile(classname, path):
    """Read the file at ``path`` as bytes and build the object for ``classname``.

    The file is closed as soon as its content has been read.
    """
    with open(path, "rb") as stream:
        metadata = stream.read()
    return xobj_fromstring(classname, metadata)

2700 

2701 

def update_bibitem_xml(bibitem, new_ids):
    """Rebuild a ``BibItem`` from ``bibitem.citation_xml`` with refreshed identifiers.

    ``new_ids`` maps an id type to a dict with the keys ``checked``,
    ``false_positive`` and ``id_value``.

    In the first ``element-citation`` (else ``mixed-citation``) child, every
    ``ext-link`` / ``pub-id`` whose type appears in ``new_ids`` is removed.
    Then one node is appended per entry that is checked and not a false
    positive: a ``pub-id`` for doi, arxiv, tel, hal and theses.fr, an
    ``ext-link`` for any other type. Nothing is changed when the citation XML
    holds neither citation element.
    """
    pub_id_types = ("doi", "arxiv", "tel", "hal", "theses.fr")
    type_attr_by_tag = {"ext-link": "ext-link-type", "pub-id": "pub-id-type"}

    xml = "<ref>" + bibitem.citation_xml + "</ref>"
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
    )
    tree = etree.fromstring(xml, parser=parser)

    node = tree.find("element-citation")
    if node is None:
        node = tree.find("mixed-citation")

    if node is not None:
        # Collect first, remove afterwards: never mutate the node while iterating it.
        children_to_remove = []
        for child in node:
            type_attr = type_attr_by_tag.get(child.tag)
            if type_attr is None:
                continue
            id_type = child.get(type_attr)
            if id_type and id_type in new_ids:
                children_to_remove.append(child)

        for child in children_to_remove:
            node.remove(child)

        for id_type, value_dict in new_ids.items():
            if value_dict["checked"] and not value_dict["false_positive"]:
                if id_type in pub_id_types:
                    new_node = etree.Element("pub-id")
                    new_node.set("pub-id-type", id_type)
                else:
                    new_node = etree.Element("ext-link")
                    new_node.set("ext-link-type", id_type)

                new_node.text = value_dict["id_value"]
                node.append(new_node)

    return BibItem(tree)

2741 

2742 

2743######################################################################################### 

2744# 

2745# Create XML strings based on internal data 

2746# 

2747######################################################################################### 

2748 

2749 

def get_contrib_xml(type, first_name, last_name, prefix, suffix, deceased):
    """Build a ``<contrib>`` XML string from the given name parts.

    ``type`` becomes the ``contrib-type`` attribute and ``deceased`` adds
    ``deceased="yes"``; both are skipped when falsy. Inside ``<name>``, the
    elements are emitted in the order prefix, given-names, surname, suffix,
    each one only when its value is truthy. Values are inserted as given,
    without any XML escaping.
    """
    pieces = ["<contrib"]
    if type:
        pieces.extend((' contrib-type="', type, '"'))
    if deceased:
        pieces.append(' deceased="yes"')
    pieces.append("><name>")

    name_parts = (
        ("prefix", prefix),
        ("given-names", first_name),
        ("surname", last_name),
        ("suffix", suffix),
    )
    for tag, value in name_parts:
        if value:
            pieces.extend(("<" + tag + ">", value, "</" + tag + ">"))

    pieces.append("</name></contrib>")
    return "".join(pieces)

2770 

2771 

def get_title_xml(title):
    """Wrap ``title`` in a ``<title-group>/<article-title>`` XML string.

    ``title`` is inserted as given, without any XML escaping.
    """
    opening = '<title-group xmlns:xlink="http://www.w3.org/1999/xlink"><article-title xml:space="preserve">'
    closing = "</article-title></title-group>"
    return opening + title + closing