Coverage for apps/ptf/cmds/xml/jats/xmldata.py: 15%

1import os

2import re

3import unicodedata

5from lxml import etree

7from django.conf import settings

8from django.utils import timezone

9from django.utils.translation import gettext_lazy as _

11from ptf.cmds.xml.xml_utils import escape

12from ptf.cmds.xml.xml_utils import normalize

13from ptf.cmds.xml.xml_utils import remove_namespace

def get_attribute_value(node, fullname, basename=None, name=None):
    """Return attribute ``fullname`` of ``node``, or ``""`` when unavailable.

    The attribute is only looked up when ``basename`` equals ``name``; both
    default to ``None``, so a two-argument call always attempts the lookup.
    A missing attribute yields the empty string.
    """
    if not (basename == name):
        return ""
    try:
        return node.attrib[fullname]
    except KeyError:
        return ""

def get_lang_attrib(node):
    """Return the language attribute of ``node``, or ``"und"`` if there is none.

    Attribute names are passed through ``normalize`` before being compared to
    ``"lang"``. When several attributes normalize to ``"lang"`` the last one
    wins. ``node`` may be ``None``.
    """
    if node is None:
        return "und"

    found = "und"
    for key, value in node.attrib.items():
        if normalize(key) == "lang":
            found = value
    return found

def get_href_attrib(node):
    """Return the href attribute of ``node``, or ``None`` if there is none.

    Attribute names are passed through ``normalize`` before being compared to
    ``"href"``. When several attributes normalize to ``"href"`` the last one
    wins. ``node`` may be ``None``.
    """
    if node is None:
        return None

    found = None
    for key, value in node.attrib.items():
        if normalize(key) == "href":
            found = value
    return found

def innerxml(node):
    """Return the inner XML of ``node`` as stripped UTF-8 bytes.

    The result is the escaped leading text of ``node`` followed by the
    serialization of each child element (``etree.tostring`` is called with its
    default ``with_tail``, so the text following each child is kept).
    """
    parts = [escape(node.text)] if node.text else []
    # Iterate the element itself: getchildren() is deprecated in lxml.
    parts.extend(etree.tostring(child, encoding="unicode") for child in node)
    return "".join(parts).strip().encode("utf-8")

def get_node_text(node):
    """Return the text content of ``node`` (markup stripped, tail excluded).

    Returns the empty string when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="text", xml_declaration=False, with_tail=False
    )

68##########################################################################

69#

70# get_mixed_content: recreate the xml string from a node

71#

72# Used to export data (OAI)

73#

74##########################################################################

def get_mixed_content(node):
    """Serialize ``node`` back to an XML string (tail excluded).

    Returns the empty string when ``node`` is ``None``.
    """
    if node is None:
        return ""
    return etree.tostring(
        node, encoding="unicode", method="xml", xml_declaration=False, with_tail=False
    )

86##########################################################################

87#

88# get_tex: get the tex version of a node with mixed-content

89#

90# Strip the mathml alternative of formula

91#

92# Used to prepare the HTML pages. A Django template can simply display title_tex

93#

94##########################################################################

def get_tex(node, is_top=True, is_citation=False):
    """Return the TeX/text version of a mixed-content ``node``.

    ``math`` elements are dropped (only their tail is kept), and
    ``element-citation`` nodes are delegated to get_element_citation_str().
    When ``is_citation`` is true, a ``string-name`` node is wrapped in a
    ``citation-author`` span (title-cased) and an ``article-title``,
    ``chapter-title`` or ``italic`` node in a ``citation-title`` span.

    The tail of ``node`` is appended unless ``is_top`` is true.
    Returns the empty string when ``node`` is ``None``.
    """
    # NOTE(review): nesting was rebuilt from a flattened listing; the tail is
    # assumed to be appended for every non-None node, including skipped
    # ``math`` nodes -- confirm against the repository.
    if node is None:
        return ""

    tag = normalize(node.tag)
    text = ""

    if tag == "element-citation":
        text += get_element_citation_str(node, is_top)
    elif tag != "math":
        wrap_as_author = is_citation and tag == "string-name"
        wrap_as_title = is_citation and tag in ("article-title", "chapter-title", "italic")

        pieces = [node.text] if node.text else []
        # Children are processed with the default is_citation=False, so the
        # citation spans only apply to the node the caller passed in.
        pieces.extend(get_tex(child, False) for child in node)
        text = "".join(pieces)

        if wrap_as_title:
            text = '<span class="citation-title">' + text + "</span>"
        elif wrap_as_author:
            text = '<span class="citation-author">' + text.title() + "</span>"

    if node.tail and not is_top:
        text += node.tail

    return text

132

133

def make_links_clickable(href, string):
    """Wrap ``string`` in an HTML anchor when ``href`` looks like a link.

    * ``http://`` / ``https://`` URLs open in a new tab (``target="_blank"``).
    * Site-relative paths (starting with ``/``) become plain anchors.
    * Anything else is returned as ``string``, unchanged.

    An empty or ``None`` ``href`` is not a link: ``string`` is returned (or
    ``""`` when ``string`` is ``None``) instead of raising ``TypeError``.
    """
    if not href:
        return string if string is not None else ""
    # The previous pattern ("http+") matched any text starting with "http",
    # e.g. "httpd"; require an actual URL scheme.
    if re.match(r"https?://", href):
        return f'<a href="{href}" target="_blank">{string}</a>'
    if href.startswith("/"):
        return f'<a href="{href}">{string}</a>'
    return string

140

141

142##########################################################################

143#

144# get_html_mixed_content_with_figures: get the mathml version of a node with mixed-content

145#

146# Strip the tex alternative of formula, add the tex version to the tooltip (HTML <title> tag)

147#

148# Used to prepare the HTML pages. A Django template can simply display the_html

149#

150# TODO: Use a dict to pass the params

151#

152##########################################################################

def get_html_mixed_content_with_figures(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    is_figure=False,
    prefix="",
    suffix="",
    sec_level=2,
    label_title="",
    figures=None,
    base_url="",
):
    """Convert a mixed-content XML ``node`` to HTML, collecting its figures.

    Args:
        node: element to convert; ``None`` yields ``("", figures)``.
        is_top: when true, the tail of ``node`` is not appended.
        is_citation: true inside a ``mixed-citation``/``toc``; enables the
            citation spans and turns ``ext-link``/``uri`` into anchors.
        is_comment: true inside a ``comment``; text is made clickable and
            ``pub-id``/``object-id`` are no longer skipped.
        is_figure: true inside a ``fig``; enables ``graphic`` handling.
        prefix, suffix: strings added before/after the converted content.
        sec_level: heading level used for ``sec``/``statement`` titles.
        label_title: pending ``list-item`` label, prepended to the next ``p``.
        figures: list receiving one dict per converted graphic; created on
            demand if ``None`` and a graphic is found.
        base_url: base joined in front of image locations.

    Returns:
        A ``(html, figures)`` tuple.

    Note:
        ``label``, ``title`` and ``caption`` children are removed from
        ``node`` while it is processed.
    """
    # NOTE(review): nesting was rebuilt from a flattened listing. The tail and
    # ``suffix`` are assumed to be appended for every non-None node, and the
    # ext-link anchor is assumed to be appended only for non-numdam links --
    # confirm against the repository.
    text = ""
    if node is None:
        return text, figures

    is_citation_author = False
    is_citation_title = False
    is_citation_volume = False

    normalized_tag = normalize(node.tag)

    if normalized_tag == "element-citation":
        # The order of the children might not be the order of display.
        text = get_element_citation_str(node, is_top)
    # pub-id/object-id are ignored by default as they are treated separately.
    # Inside citations or comments, ext-links are converted to HTML links.
    elif is_comment or normalized_tag not in ("pub-id", "object-id"):
        if normalized_tag in ("mixed-citation", "toc"):
            is_citation = True
        elif normalized_tag == "comment":
            is_comment = True
        elif is_citation and normalized_tag == "string-name":
            is_citation_author = True
        elif is_citation and normalized_tag in ("article-title", "chapter-title", "italic"):
            is_citation_title = True
        elif is_citation and normalized_tag == "volume":
            is_citation_volume = True

        text += prefix

        if is_citation and normalized_tag == "ext-link":
            if node.get("ext-link-type") is None:
                href = get_href_attrib(node) or node.text
                # href is None for an empty ext-link without href attribute.
                if href and "www.numdam.org" not in href:
                    text += make_links_clickable(href, node.text) or ""
        elif is_citation and normalized_tag == "uri":
            href = get_href_attrib(node) or node.text
            if href:
                text += make_links_clickable(href, node.text) or ""
        elif is_comment and node.text:
            # Whitespace-only text (indentation between children) is dropped.
            if not re.match(r"[\n ]+", node.text):
                text += make_links_clickable(node.text, node.text)
        elif node.text:
            text += node.text

        label = ""
        if normalized_tag in ("sec", "statement", "fig", "list-item", "table-wrap"):
            child = node.find("label")
            if child is not None:
                label += child.text or ""
                node.remove(child)
            child = node.find("title")
            if child is not None:
                if label:
                    label += " "
                label += child.text or ""
                node.remove(child)

        if normalized_tag in ("sec", "statement"):
            text = "<h" + str(sec_level) + ">" + label + "</h" + str(sec_level) + ">"
            sec_level += 1

        if normalized_tag == "table-wrap":
            text = "<strong>" + label + "</strong>"

        if normalized_tag == "fig":
            is_figure = True
            child = node.find("caption")
            if child is not None:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    "",
                    figures,
                    base_url,
                )
                label += " : " + child_text
                node.remove(child)

        if normalized_tag == "list-item":
            label_title = label

        if normalized_tag == "p" and label_title:
            text = label_title + " " + text
            label_title = ""

        if normalized_tag in ("inline-formula", "disp-formula"):
            for child in node:
                if child.tag == "alternatives":
                    math_text = ""
                    tex_text = ""

                    for great_child in child:
                        # A dedicated name is used so that the tag of the
                        # formula itself is not overwritten.
                        if normalize(great_child.tag) == "math":
                            math_text = get_mixed_content(great_child)
                        else:
                            tex_text = get_tex(great_child)

                    # NOTE(review): tex_text is inserted unescaped in the
                    # title attribute -- confirm it cannot contain quotes.
                    text += '<span title="' + tex_text + '">' + math_text + "</span>"
        else:
            for child in node:
                child_text, figures = get_html_mixed_content_with_figures(
                    child,
                    False,
                    is_citation,
                    is_comment,
                    is_figure,
                    "",
                    "",
                    sec_level,
                    label_title,
                    figures,
                    base_url,
                )
                text += child_text

        if is_citation_title:
            text = '<span class="citation-document-title">' + text + "</span>"
        elif is_citation_author:
            text = '<span class="citation-author">' + text.title() + "</span>"
        elif is_citation_volume:
            text = '<span class="citation-volume">' + text + "</span>"
        elif normalized_tag == "list":
            list_type = node.get("list-type")
            ordered_types = {
                "order": "1",
                "alpha-lower": "a",
                "alpha-upper": "A",
                "roman-lower": "i",
                "roman-upper": "I",
            }
            if list_type is None or list_type == "bullet":
                text = "<ul>" + text + "</ul>"
            elif list_type in ordered_types:
                text = '<ol type="' + ordered_types[list_type] + '">' + text + "</ol>"
            else:
                text = '<ul class="no-bullet" style="list-style-type:none;">' + text + "</ul>"
        elif normalized_tag == "list-item":
            text = "<li>" + text + "</li>"
        elif normalized_tag in ("strong", "bold"):
            text = "<strong>" + text + "</strong>"
        elif normalized_tag == "italic":
            text = '<span class="italique">' + text + "</span>"
        elif normalized_tag == "p":
            css_class = node.get("specific-use")
            if css_class:
                text = '<p class="' + css_class + '">' + text + "</p>"
            else:
                text = "<p>" + text + "</p>"
        elif normalized_tag == "caption" and not is_figure:
            text = '<div class="caption">' + text + "</div>"
        elif normalized_tag in ("sec", "statement"):
            text = "<section>" + text + "</section>"
        elif normalized_tag == "fig":
            fig_id = node.get("id")
            tag = '<figure id="' + fig_id + '">' if fig_id else "<figure>"
            text = tag + text
            if label:
                text += "<figcaption>" + label + "</figcaption>"
            text += "</figure>"
        elif normalized_tag in ("sub", "sup"):
            text = "<" + normalized_tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "xref":
            rid = node.get("rid")
            if rid:
                text = '<a href="#' + rid + '">' + text + "</a>"
        elif normalized_tag == "graphic" and is_figure:
            # The former loop reset href to "" whenever an attribute other
            # than href followed it; use the shared helper instead.
            href = get_href_attrib(node) or ""

            if href:
                basename = os.path.basename(href)
                ext = basename.split(".")[-1]
                is_png = ext == "png"

                location = "src/tex/figures/" + basename
                figure = {
                    "rel": "image",
                    "mimetype": "image/png" if is_png else "image/jpeg",
                    "location": location,
                    "base": None,
                    "text": node.text if node.text is not None else "",
                }

                location = os.path.join(base_url, "png" if is_png else "jpg", location)
                text = '<img src="' + location + '" class="article-body-img" />'

                # Callers such as get_html_mixed_content pass figures=None.
                if figures is None:
                    figures = []
                figures.append(figure)
        elif normalized_tag in ("table", "th", "thead", "tr", "td"):
            tag = "<" + normalized_tag
            if "rowspan" in node.attrib:
                tag += ' rowspan="' + node.attrib["rowspan"] + '"'
            text = tag + ">" + text + "</" + normalized_tag + ">"
        elif normalized_tag == "table-wrap":
            tag = '<div class="table-wrap"'
            wrap_id = node.get("id")
            if wrap_id:
                tag += ' id="' + wrap_id + '"'

            text = tag + ">" + text + "</div>"

    if node.tail and not is_top:
        text += node.tail

    text += suffix

    return text, figures

434

435

def get_html_mixed_content(
    node,
    is_top=True,
    is_citation=False,
    is_comment=False,
    prefix="",
    suffix="",
    sec_level=2,
    label="",
):
    """Return the HTML version of a mixed-content ``node``.

    Thin wrapper around get_html_mixed_content_with_figures(), called with
    ``is_figure=False`` and ``figures=None``; the figures part of its result
    is discarded.
    """
    html, _figures = get_html_mixed_content_with_figures(
        node, is_top, is_citation, is_comment, False, prefix, suffix, sec_level, label, None
    )
    return html

450

451

452##########################################################################

453#

454# get_element_citation_str: get the mixed content of an element-citation node

455#

456# An element-citation node is specific as the order of its children might not be

457# the correct order for display

458#

459# Used to prepare the HTML pages. A Django template can simply display title_html

460#

461##########################################################################

def get_element_citation_str(node, is_top=False, is_html=True):
    """Build the display string of an ``element-citation`` node.

    The children of an element-citation are not necessarily stored in display
    order, so the string is assembled field by field: authors, document
    title, source (with editors for books/incollections), series, volume,
    publisher data, year, issue, article ids, pages, links and comment.

    Args:
        node: the ``element-citation`` element; ``None`` yields ``""``.
        is_top: accepted for interface compatibility; not used.
        is_html: when false, titles are rendered with get_tex() instead of
            get_html_mixed_content().

    The ``REF_JEP_STYLE`` Django setting selects an alternative punctuation
    style.
    """
    # NOTE(review): nesting was rebuilt from a flattened listing -- confirm
    # the prefix/suffix branches against the repository.
    if node is None:
        return ""

    ref_jep_style = getattr(settings, "REF_JEP_STYLE", False)
    pub_type = node.get("publication-type")
    jep_incollection = ref_jep_style and pub_type == "incollection"
    jep_book = ref_jep_style and pub_type == "book"

    name_str = get_author_str(node)
    text = name_str

    # --- document title (article-title / chapter-title) ---
    document_title = ""
    if is_html:
        prefix, suffix = (" - “", "”") if ref_jep_style else (" ", "")
        document_title += get_html_mixed_content(
            node.find("article-title"), True, True, False, prefix, suffix
        )
        if jep_incollection:
            document_title += get_html_mixed_content(
                node.find("chapter-title"), True, True, False, prefix, suffix
            )
        else:
            document_title += get_html_mixed_content(
                node.find("chapter-title"), True, True, False, " "
            )
    else:
        document_title += " " + get_tex(node.find("article-title"))
        document_title += " " + get_tex(node.find("chapter-title"))

    text += document_title

    # --- source: a publication title if there is a document title,
    #     otherwise it plays the role of the document title ---
    prefix = ""
    suffix = "</span>"

    if document_title:
        if jep_incollection:
            prefix += ', in <span class="citation-publication-title">'
        else:
            prefix += ', <span class="citation-publication-title">'
    else:
        if name_str:
            prefix = " - " if ref_jep_style else " "
        if pub_type in ("unpublished", "misc"):
            prefix += "“"
            suffix += "”"
        prefix += '<span class="citation-document-title">'

    source = get_html_mixed_content(node.find("source"), True, True, False, prefix, suffix)
    if jep_book:
        source = f"<i>{source}</i>"
    if pub_type in ("book", "incollection"):
        source += get_editor_str(node.find("person-group"))
    text += source

    # --- series ---
    if document_title:
        if jep_incollection:
            # NOTE(review): suffix keeps the value used for the source.
            prefix = ", "
        else:
            prefix = " ("
            suffix = ")"
    else:
        if jep_book:
            prefix = ', <span class="citation-publication-title-book">'
        else:
            prefix = ', <span class="citation-publication-title">'
        suffix = "</span>"

    serie = get_html_mixed_content(node.find("series"), True, True, False, prefix, suffix)
    text += serie

    # --- volume ---
    if ref_jep_style:
        prefix = ", vol. " if pub_type in ("incollection", "book") else " "
    else:
        if document_title:
            prefix = " " if serie else ", "
        else:
            prefix = ", " if serie else " "
        prefix += str(_("Tome")) + " "

    text += get_html_mixed_content(node.find("volume"), True, True, False, prefix)

    # --- publisher data and year ---
    if pub_type in ("incollection", "book"):
        text = text.replace("citation-volume", "citation-volume-incollection")
        text += get_html_mixed_content(node.find("publisher-name"), True, True, False, ", ")
        text += get_html_mixed_content(node.find("publisher-loc"), True, True, False, ", ")
        text += get_html_mixed_content(node.find("institution"), True, True, False, ", ")
        prefix = ", "
        suffix = ""
    elif pub_type == "misc":
        prefix = ", "
        suffix = ""
    else:
        prefix = " ("
        suffix = ")"
    text += get_html_mixed_content(node.find("year"), True, True, False, prefix, suffix)
    text += get_html_mixed_content(node.find("issue"), True, True, False, " no. ")

    # --- electronic article ids (elements without text are skipped) ---
    for child in node.findall("pub-id"):
        if child.get("pub-id-type") == "eid" and child.text is not None:
            text += ", " + child.text

    for child in node.findall("ext-link"):
        if child.get("ext-link-type") == "eid" and child.text is not None:
            if ref_jep_style:
                text += ", article ID " + child.text
            else:
                text += ", " + child.text

    if not jep_book:
        text += get_pages_str(node)

    # --- untyped external links ---
    for child in node.findall("ext-link"):
        if child.get("ext-link-type") is None:
            href = get_href_attrib(child) or child.text
            # bibitems with ext-links pointing to numdam.org have a numdam-id,
            # ext-links to doi.org are transformed into an extid:
            # both cases can be ignored.
            if (
                href
                and "www.numdam.org" not in href
                and "doi.org" not in href
                and not ref_jep_style
            ):
                text += " " + (make_links_clickable(href, child.text) or "")

    # --- comment ---
    if ref_jep_style:
        text += get_html_mixed_content(node.find("comment"), True, True, True, ", ")
    else:
        text += get_html_mixed_content(node.find("comment"), True, True, True, " (", ")")

    return text

627

628

629def get_name_str(node):

630 text = ""

631 REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False)

632

633 if node is not None:

634 names = node.findall("name")

635 i = 1

636 for name_node in names:

637 first_name = last_name = prefix = suffix = string_name = ""

638

639 for child in name_node:

640 if child.tag == "given-names":

641 if REF_JEP_STYLE:

642 first_name += child.get("initials", "")

643 else:

644 if child.text is None:

645 child.text = ""

646 first_name += child.text

647 if child.tag == "surname":

648 last_name += child.text

649 if child.tag == "prefix":

650 prefix += child.text

651 if child.tag == "suffix":

652 suffix += child.text

653

654 if prefix:

655 string_name = prefix + " "

656

657 if getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False):

658 if first_name:

659 string_name += first_name + " "

660 string_name += last_name

661 else:

662 string_name += last_name

663

664 if first_name:

665 string_name += ", " + first_name

666

667 if suffix:

668 string_name += " " + suffix

669

670 if text:

671 if i == len(names) and REF_JEP_STYLE:

672 text += " & "

673 elif REF_JEP_STYLE:

674 text += ", "

675 else:

676 text += "; "

677

678 text += string_name

679 i += 1

680

681 names = node.findall("string-name")

682 i = 1

683 for name_node in names:

684 string_name = get_tex(name_node)

685

686 if text:

687 if i == len(names) and REF_JEP_STYLE:

688 text += " & "

689 elif REF_JEP_STYLE:

690 text += ", "

691 else:

692 text += "; "

693

694 text += string_name

695 i += 1

696 return text

697

698

def get_author_str(node):
    """Return the names under ``node`` wrapped in a ``citation-author`` span.

    The span is emitted even when get_name_str() returns an empty string.
    """
    return '<span class="citation-author">{}</span>'.format(get_name_str(node))

702

703

def get_editor_str(node):
    """Return `` (<editors>, ed.)`` / `` (<editors>, eds.)`` for ``node``.

    Returns ``""`` when get_name_str() finds no name.
    """
    editors = get_name_str(node)
    if not editors:
        return ""

    # Names are separated by ';', or by '&' before the last one in JEP style:
    # either separator means there is more than one editor.
    has_several = ";" in editors.replace("&", ";")
    abbreviation = "eds." if has_several else "ed."
    return f" ({editors}, {abbreviation})"

712

713

714def get_pages_str(node):

715 text = ""

716 REF_JEP_STYLE = getattr(settings, "REF_JEP_STYLE", False)

717 child = node.find("page-count")

718 if child is not None:

719 text += get_html_mixed_content(child, True, True, False, ", ", " pages")

720

721 if not text:

722 child = node.find("size")

723 if child is not None:

724 text += get_html_mixed_content(child, True, True, False, ", ", " pages")

725

726 if not text:

727 first_page_child = node.find("fpage")

728 if first_page_child is not None:

729 fpage_text = get_html_mixed_content(first_page_child, True, True, False)

730 lpage_text = ""

731 fpage_int = lpage_int = 0

732 try:

733 fpage_int = int(fpage_text)

734 except BaseException:

735 pass

736

737 last_page_child = node.find("lpage")

738 if last_page_child is not None:

739 lpage_text = get_html_mixed_content(last_page_child, True, True, False)

740 try:

741 lpage_int = int(lpage_text)

742 except BaseException:

743 pass

744

745 if lpage_int > 0 and lpage_int - fpage_int > 1 and not REF_JEP_STYLE:

746 text += ", pp. "

747 else:

748 text += ", p. "

749 text += fpage_text

750 if lpage_text:

751 text += "-" + lpage_text

752

753 if not text:

754 child = node.find("page-range")

755 if child is not None:

756 prefix = ", pp. "

757 suffix = ""

758 if REF_JEP_STYLE:

759 prefix = ", p. "

760

761 text += get_html_mixed_content(child, True, True, False, prefix, suffix)

762

763 return text

764

765

766##########################################################################

767#

768# Parse a name node ("name", "string-name", or "name-alternative) and find the fields related to a person name:

769# first_name <given-names>

770# last_name <surname>

771# prefix <prefix>

772# suffix <suffix>

773# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>"

774# reference_name <string_name specific-use="index"> or string_name

775# Used in Solr for facets (regroup multiple orthographies under the same person)

776#

777# Note: parse_name and get_name_str can not be merged...today

778# string-names in mixed-citation mix structured data (ex: "surname") and non structured content.

779# Ex: <surname>ROBERTSON</surname>, <given-names>D. H.</given-names></string-name>

780# Notice the ", " inside.

781# get_name_str is used for web pages and need to preserve everything (the ', " in particular)

782# parse_name is used to export bibtex: only structured data are preserved.

783# TODO: discuss this workflow. Why add or preserve the mix content of a string-name ?

784#

785# TODO: merge parse_name and parse_contrib

786# 1. <contrib> can have multiple entries (ex: <name> then <string-name specific-use="index") for 1 single person,

787# whereas <mixed-citation> or <element-citation> use 1 entry per person.

788# 2. string-name is a contrib is a simple text, string-name in mixed-citation is a tree

789#

790##########################################################################

791

792

def get_name_params(first_name, last_name, prefix, suffix, string_name="", reference_name=""):
    """Complete the fields describing a person name and return them as a dict.

    * ``reference_name`` defaults to ``string_name`` when only the latter is
      given.
    * ``string_name`` is built as ``"<prefix> <last>, <first> <suffix>"`` when
      a last name is known but no string name.
    * Conversely, a ``"last,first"`` string name is split on commas to fill
      ``last_name`` / ``first_name`` (the pieces are not stripped).
    * ``reference_name`` is finally derived from the first/last names, in the
      order selected by the ``DISPLAY_FIRST_NAME_FIRST`` setting.

    Returns a dict with the keys ``first_name``, ``last_name``, ``prefix``,
    ``suffix``, ``string_name`` and ``reference_name``.
    """
    if string_name and not reference_name:
        reference_name = string_name

    if last_name and not string_name:
        built = prefix + " " if prefix else string_name
        built += last_name
        if first_name:
            built += ", " + first_name
        if suffix:
            built += " " + suffix
        string_name = built
    elif string_name and not last_name:
        pieces = string_name.split(",")
        if len(pieces) > 1:
            last_name, first_name = pieces[0], pieces[1]

    if not reference_name and last_name:
        if getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False):
            reference_name = (first_name + " " if first_name else "") + last_name
        else:
            reference_name = last_name + (", " + first_name if first_name else "")

    return {
        "first_name": first_name,
        "last_name": last_name,
        "prefix": prefix,
        "suffix": suffix,
        "string_name": string_name,
        "reference_name": reference_name,
    }

836

837

def parse_name(node):
    """Extract the person-name fields of a name node.

    ``node`` may be a ``name``, a ``string-name`` or a ``name-alternatives``
    element (or ``None``):

    * ``name`` / ``string-name``: the ``given-names``, ``surname``,
      ``prefix`` and ``suffix`` children are collected; a ``string-name``
      without given names and surname falls back to its own leading text.
    * ``name-alternatives``: the ``string-name`` children marked
      ``specific-use="index"`` give the reference name.

    Children without text are ignored in every branch. The collected values
    are completed and returned by get_name_params().
    """
    parts = {"given-names": "", "surname": "", "prefix": "", "suffix": ""}
    string_name = reference_name = ""

    if node is not None:
        if node.tag in ("name", "string-name"):
            for child in node:
                if child.text and child.tag in parts:
                    parts[child.tag] += child.text

            if node.tag == "string-name" and not parts["given-names"] and not parts["surname"]:
                string_name = node.text or ""
        elif node.tag == "name-alternatives":
            for child in node:
                if child.tag == "string-name" and child.get("specific-use") == "index":
                    reference_name += child.text or ""

    return get_name_params(
        parts["given-names"],
        parts["surname"],
        parts["prefix"],
        parts["suffix"],
        string_name,
        reference_name,
    )

876

877

878##########################################################################

879#

880# Parse a Contrib node and find the fields related to a person name:

881# first_name <given-names>

882# last_name <surname>

883# prefix <prefix>

884# suffix <suffix>

885# string_name <string_name> or built with "<prefix> <last_name>, <first_name>, <suffix>"

886# reference_name <string_name specific-use="index"> or string_name

887# Used in Solr for facets (regroup multiple orthographies under the same person)

888#

889##########################################################################

890

891

def parse_contrib(node):
    """Extract the person-name fields of a ``contrib`` node.

    The children of ``node`` are scanned for:

    * ``name``: its ``given-names``, ``surname``, ``prefix`` and ``suffix``
      children are collected;
    * ``string-name``: its text is the string name;
    * ``name-alternatives``: its ``string-name`` children marked
      ``specific-use="index"`` give the reference name.

    Elements without text are ignored. The collected values are completed and
    returned by get_name_params(), which holds the logic this function used
    to duplicate.
    """
    parts = {"given-names": "", "surname": "", "prefix": "", "suffix": ""}
    string_name = reference_name = ""

    if node is not None:
        for child in node:
            if child.tag == "name":
                for great_child in child:
                    if great_child.text is not None and great_child.tag in parts:
                        parts[great_child.tag] += great_child.text
            elif child.tag == "string-name":
                if child.text is not None:
                    string_name += child.text
            elif child.tag == "name-alternatives":
                for great_child in child:
                    if (
                        great_child.text is not None
                        and great_child.tag == "string-name"
                        and great_child.get("specific-use") == "index"
                    ):
                        reference_name += great_child.text

    return get_name_params(
        parts["given-names"],
        parts["surname"],
        parts["prefix"],
        parts["suffix"],
        string_name,
        reference_name,
    )

960

961

def make_int(value):
    """Return the leading number of ``value`` as an int.

    Only the part before the first ``-`` is considered (``"12-15"`` -> 12).
    If that part is not a plain integer, its decimal digits are kept and the
    rest is dropped (``"S12"`` -> 12).

    Raises:
        ValueError: if the part before the first ``-`` contains no digit.
    """
    head = value.split("-")[0]
    try:
        return int(head)
    except ValueError:
        # The digits must be joined back into a string: int() does not accept
        # a list (the former fallback always raised TypeError).
        return int("".join(char for char in head if char.isdecimal()))

972

973

def uni2ascii(s):
    """Return ``str(s)`` as ASCII bytes.

    The text is NFKD-decomposed, then every character that is still not ASCII
    (e.g. the combining accents) is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", str(s))
    return decomposed.encode("ascii", "ignore")

977

978

# Module-level identifier types; both start unset and are assigned through
# set_sid_type() / set_pid_type().
sid_type = None
pid_type = None

981

982

def set_sid_type(id_type):
    """Store ``id_type`` in the module-level ``sid_type`` variable."""
    global sid_type
    sid_type = id_type

986

987

def set_pid_type(id_type):
    """Store ``id_type`` in the module-level ``pid_type`` variable."""
    global pid_type
    pid_type = id_type

991

992

class XmlData:
    # XPath of the identifier nodes read by get_ids(); None means "no ids".
    ids_xpath = None
    # Attribute giving the type of each identifier node in get_ids().
    id_type_attr = "pub-id-type"

    # Not read in the visible part of the class.
    extids_xpath = None
    extid_type_attr = None
    # Path of the title group serialized by get_title_xml().
    title_group_elt_path = None
    # Path of the title rendered by get_title_html() / get_title_tex().
    title_path = None
    # Path of the element whose language attribute get_trans_lang() reads.
    trans_title_group_elt_path = None
    # Path of the title rendered by get_trans_title_html() / get_trans_title_tex().
    trans_title_path = None
    # Fallbacks used when title_path / title_group_elt_path match nothing.
    alternate_title_path = None
    alternate_title_group_elt_path = None
    # Path of the metadata root looked up by __init__; "" disables the lookup.
    meta_root_xpath = ""
    # Not read in the visible part of the class.
    custom_meta_path = "custom-meta-group"
    counts_path = "counts"
    remove_links = False

1009

1010 def __init__(self, tree):

1011 self.tree = tree

1012 if self.meta_root_xpath:

1013 self.meta_root = tree.find(self.meta_root_xpath)

1014 else:

1015 self.meta_root = None

1016

1017 def __getattr__(self, name):

1018 mname = "get_" + name if "self" not in name else name

1019 getter = getattr(self, mname)

1020 obj = getter()

1021 setattr(self, name, obj)

1022 return obj

1023

    def get_doi(self):
        """Return the DOI; this implementation always returns ``None``."""
        return None

1026

    def xpath(self, xpath):
        """Evaluate ``xpath`` on the wrapped tree and return the result."""
        return self.tree.xpath(xpath)

1029

1030 def xget_subtree(self, xpath):

1031 subtree = self.tree.xpath(xpath)

1032 if subtree:

1033 return subtree[0]

1034 return None

1035

    def xget_subtrees(self, xpath):
        """Return every result of ``xpath`` evaluated on the wrapped tree."""
        return self.tree.xpath(xpath)

1038

    def get_subtree(self, path):
        """Return the result of ``tree.find(path)``."""
        return self.tree.find(path)

1041

    def get_subtrees(self, path):
        """Return the result of ``tree.findall(path)``."""
        return self.tree.findall(path)

1044

1045 def get_node_text(self, path, return_none=""):

1046 node = self.tree.find(path)

1047 if node is None:

1048 return return_none

1049 if node.text is None:

1050 return return_none

1051 xml_text = etree.tostring(

1052 node, encoding="unicode", method="text", xml_declaration=False, with_tail=False

1053 )

1054 return xml_text

1055

    def get_nodes_text(self, path):
        """Return the ``text`` of every node found by ``tree.findall(path)``."""
        return [x.text for x in self.tree.findall(path)]

1058

    def get_ascii_text(self, path, return_none=""):
        """Return get_node_text(path) converted by uni2ascii()."""
        return uni2ascii(self.get_node_text(path, return_none=return_none))

1061

1062 def xget_node_text(self, xpath, return_none=None):

1063 try:

1064 return self.tree.xpath(xpath)[0].text

1065 except BaseException:

1066 return return_none

1067

    def xget_ascii_text(self, xpath, return_none=""):
        """Return xget_node_text(xpath) converted by uni2ascii()."""
        return uni2ascii(self.xget_node_text(xpath, return_none=return_none))

1070

1071 def tostring(self):

1072 self.prune()

1073 return etree.tostring(self.tree, encoding="utf-8", xml_declaration=False)

1074

1075 __str__ = tostring

1076

    def prune(self):
        """Hook called by tostring() before serialization; does nothing here."""
        pass

1079

1080 def get_ids(self):

1081 if self.ids_xpath is not None:

1082 nodes = self.xget_subtrees(self.ids_xpath)

1083 return [(x.get(self.id_type_attr), x.text) for x in nodes if x.text is not None]

1084 return []

1085

def get_mathdoc_id(self):
    """Return the text of the first node selected by ``self.mathdoc_id_xpath``.

    Returns None when the xpath is None or selects nothing.
    """
    if self.mathdoc_id_xpath is None:
        return None
    try:
        first = self.xget_subtrees(self.mathdoc_id_xpath)[0]
    except IndexError:
        return None
    return first.text

1095

def get_title_xml(self):
    """Return the title group as an XML string, or "" when it is absent.

    The element at ``self.title_group_elt_path`` is used; when it does not
    exist and ``self.alternate_title_group_elt_path`` is set, that path is
    tried instead.
    """
    group = self.tree.find(self.title_group_elt_path)
    if group is None and self.alternate_title_group_elt_path:
        group = self.tree.find(self.alternate_title_group_elt_path)
    if group is None:
        return ""
    return get_mixed_content(group)

1104

def inner_get_title_html(self, path, alternate_path=None):
    """Return the HTML rendering of the element at ``path``, or "" when absent.

    ``alternate_path`` is tried when ``path`` matches nothing.
    """
    node = self.tree.find(path)
    if node is None and alternate_path:
        node = self.tree.find(alternate_path)
    if node is None:
        return ""
    return get_html_mixed_content(node)

1113

def get_title_html(self):
    """Return the HTML title found at ``self.title_path`` or ``self.alternate_title_path``."""
    main_path = self.title_path
    fallback_path = self.alternate_title_path
    return self.inner_get_title_html(main_path, fallback_path)

1116

def get_trans_title_html(self):
    """Return the HTML rendering of the translated title at ``self.trans_title_path``."""
    path = self.trans_title_path
    return self.inner_get_title_html(path)

1119

def inner_get_title_tex(self, path, alternate_path=None):
    """Return the TeX rendering of the element at ``path``, or "" when absent.

    ``alternate_path`` is tried when ``path`` matches nothing.
    """
    node = self.tree.find(path)
    if node is None and alternate_path:
        node = self.tree.find(alternate_path)
    if node is None:
        return ""
    return get_tex(node)

1128

def get_title_tex(self):
    """Return the TeX title found at ``self.title_path`` or ``self.alternate_title_path``."""
    main_path = self.title_path
    fallback_path = self.alternate_title_path
    return self.inner_get_title_tex(main_path, fallback_path)

1131

def get_trans_title_tex(self):
    """Return the TeX rendering of the translated title at ``self.trans_title_path``."""
    path = self.trans_title_path
    return self.inner_get_title_tex(path)

1134

def get_lang(self):
    """Return the language of ``self.tree``.

    The lang attribute of the tree's own element is used when it is not
    "und". Otherwise the topmost ancestor is consulted (intermediate
    ancestors are not), which yields "und" again when the element has no
    parent.
    """
    lang = get_lang_attrib(self.tree)
    if lang != "und":
        return lang

    # Climb to the topmost ancestor; stays None when there is no parent.
    topmost = None
    ancestor = self.tree.getparent()
    while ancestor is not None:
        topmost = ancestor
        ancestor = ancestor.getparent()

    return get_lang_attrib(topmost)

1151

def get_trans_lang(self):
    """Return the lang attribute of the element at ``self.trans_title_group_elt_path``."""
    group = self.tree.find(self.trans_title_group_elt_path)
    return get_lang_attrib(group)

1158

def get_extids(self):
    """Return ``(id_type, value)`` pairs for the nodes selected by ``self.extids_xpath``.

    The id type is read from the attribute named by ``self.extid_type_attr``.
    A node without that attribute whose text contains "doi.org/" is treated
    as a DOI. Only the known id types listed below are kept; "numdam-id" is
    reported as "mathdoc-id". DOI values are stripped of their resolver URL
    and "doi:" prefix, the same normalization ``BibItem`` applies to pub-ids.
    Nodes without text are skipped instead of raising AttributeError.
    When ``self.remove_links`` is true, every kept node is removed from its
    parent. Returns an empty list when ``self.extids_xpath`` is None.
    """
    if self.extids_xpath is None:
        return []

    known_types = (
        "mr-item-id",
        "zbl-item-id",
        "sps-id",
        "numdam-id",
        "mathdoc-id",
        "jfm-item-id",
        "eudml-item-id",
        "doi",
        "eid",
    )
    links = []
    for node in self.xget_subtrees(self.extids_xpath):
        if node.text is None:
            # An empty element carries no identifier value.
            continue
        id_type = node.get(self.extid_type_attr)
        value = node.text.strip()
        if id_type is None and value.find("doi.org/") > 0:
            id_type = "doi"
        if id_type not in known_types:
            continue
        if id_type == "numdam-id":
            id_type = "mathdoc-id"
        if id_type == "doi":
            # Each replace is a no-op when its prefix is absent.
            value = value.replace("http://dx.doi.org/", "")
            value = value.replace("https://doi.org/", "")
            value = value.replace("doi:", "")
        links.append((id_type, value))
        if self.remove_links:
            node.getparent().remove(node)
    return links

1191

def get_xml(self, path, return_none=""):
    """Return the element at ``path`` serialized as UTF-8 bytes.

    ``return_none`` is returned unchanged when no element matches.
    """
    node = self.get_subtree(path)
    if node is None:
        return return_none
    return etree.tostring(node, encoding="utf-8", xml_declaration=False)

1197

def get_inner_xml(self, path, return_none=""):
    """Return ``innerxml`` of the element at ``path``, or ``return_none`` when absent."""
    node = self.get_subtree(path)
    if node is None:
        return return_none
    return innerxml(node)

1203

def xget_xml(self, path):
    """Return the first node selected by the xpath ``path`` as UTF-8 bytes, or "" when absent."""
    node = self.xget_subtree(path)
    if node is None:
        return ""
    return etree.tostring(node, encoding="utf-8", xml_declaration=False)

1209

def get_catxml(self, path):
    """Return the concatenated XML serialization of every element matching ``path``.

    Elements are serialized with ``encoding="unicode"`` so that each piece
    is a ``str``. Without an encoding ``etree.tostring`` returns bytes,
    which ``"".join`` rejects with a TypeError. Returns "" when nothing
    matches.
    """
    return "".join(
        etree.tostring(node, encoding="unicode") for node in self.get_subtrees(path)
    )

1216

def get_streams(self):
    """Return one dict per ``self-uri`` child of the metadata root.

    The children are looked up on ``self.meta_root`` when it is not None,
    on ``self.tree`` otherwise. Attribute names are compared after
    ``normalize``. The mimetype defaults to "text/html" and the link text
    to "Link".
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    streams = []
    for node in root.findall("self-uri"):
        href = base = mimetype = ""
        for attr_name, attr_value in node.attrib.items():
            name = normalize(attr_name)
            if name == "href":
                href = attr_value
            elif name == "base":
                base = attr_value
            elif name == "content-type":
                mimetype = attr_value

        streams.append(
            {
                "rel": "full-text",
                "mimetype": mimetype or "text/html",
                "location": href,
                "base": base,
                "text": node.text if node.text else "Link",
            }
        )
    return streams

1242

def get_related_objects(self):
    """Return one dict per ``related-object`` child of the metadata root.

    The children are looked up on ``self.meta_root`` when it is not None,
    on ``self.tree`` otherwise. Attribute names are compared after
    ``normalize``. The "metadata" entry holds ``innerxml`` of the node.
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    related = []
    for node in root.findall("related-object"):
        rel = href = base = mimetype = ""
        for attr_name, attr_value in node.attrib.items():
            name = normalize(attr_name)
            if name == "link-type":
                rel = attr_value
            elif name == "href":
                href = attr_value
            elif name == "base":
                base = attr_value
            elif name == "content-type":
                mimetype = attr_value

        related.append(
            {
                "rel": rel,
                "mimetype": mimetype,
                "location": href,
                "base": base,
                "metadata": innerxml(node),
            }
        )
    return related

1263

def get_supplementary_materials(self):
    """Return one dict per ``supplementary-material`` child of the metadata root.

    The children are looked up on ``self.meta_root`` when it is not None,
    on ``self.tree`` otherwise. The location is the "href" attribute,
    falling back to "id" (KeyError if both are missing). The caption is the
    first text node directly under ``caption``; a material without caption
    text gets "" instead of raising IndexError.
    """
    root = self.meta_root if self.meta_root is not None else self.tree

    materials = []
    for node in root.findall("supplementary-material"):
        try:
            location = node.attrib["href"]
        except KeyError:
            location = node.attrib["id"]
        caption_texts = node.xpath("caption/text()")
        materials.append(
            {
                "rel": node.attrib.get("content-type"),
                "mimetype": node.attrib.get("mimetype"),
                "location": location,
                "base": "",
                "metadata": "",
                "caption": caption_texts[0] if caption_texts else "",
            }
        )
    return materials

1285

def get_metadataparts(self):
    """Return the metadata parts; this implementation always returns a new empty list."""
    parts = []
    return parts

1288

def get_custom_meta(self):
    """Return the custom metadata as a dict.

    Each child of the element at ``self.custom_meta_path`` contributes one
    entry: the text of its first sub-element is the key and the text of
    its second sub-element the value. Returns {} when the path is unset or
    matches nothing.
    """
    if not self.custom_meta_path:
        return {}
    group = self.tree.find(self.custom_meta_path)
    if group is None:
        return {}
    return {entry[0].text: entry[1].text for entry in group}

1299

def get_wall(self):
    """Return the "wall" entry of ``self.custom_meta`` as an int, or 0 when it is missing."""
    try:
        return int(self.custom_meta["wall"])
    except KeyError:
        return 0

1306

def get_pid(self):
    """Return the first id in ``self.ids`` whose type matches ``pid_type``, else None.

    "numdam-id" and "mathdoc-id" are treated as interchangeable.
    ``pid_type`` is not defined in this method; presumably a module-level
    name defined elsewhere in the file — TODO confirm.
    """
    interchangeable = ("numdam-id", "mathdoc-id")
    for id_type, id_value in self.ids:
        if id_type == pid_type:
            return id_value
        if id_type in interchangeable and pid_type in interchangeable:
            return id_value
    return None

1319

def get_provider(self):
    """Return the "provider" entry of ``self.custom_meta``, or None when it is missing."""
    meta = self.custom_meta
    return meta.get("provider", None)

1322

def get_sid(self):
    """Return the first id in ``self.ids`` whose type equals ``sid_type``, else None.

    ``sid_type`` is not defined in this method; presumably a module-level
    name defined elsewhere in the file — TODO confirm.
    """
    matches = (id_value for id_type, id_value in self.ids if id_type == sid_type)
    return next(matches, None)

1328

def get_counts(self):
    """Return ``[("page-count", count)]`` read from the element at ``self.counts_path``.

    The count comes from the "count" attribute of the ``page-count`` child,
    or of ``book-page-count`` when the former is absent. When that
    attribute is empty, the text content of the counts element is used.
    Returns an empty list when the path is unset, matches nothing, or when
    neither child exists (this last case used to raise AttributeError).
    """
    counts = []
    if not self.counts_path:
        return counts

    node = self.tree.find(self.counts_path)
    if node is None:
        return counts

    page_count = node.find("page-count")
    if page_count is None:
        page_count = node.find("book-page-count")
    if page_count is None:
        return counts

    count = page_count.get("count")
    if not count:
        # NOTE(review): the text of the whole counts element is used here,
        # not that of the page-count child — confirm this is intended.
        count = get_node_text(node)
    counts.append(("page-count", count))
    return counts

1342

def get_ext_links(self):
    """Return one dict per ``ext-link`` child of the metadata root.

    The children are looked up on ``self.meta_root`` when it is not None,
    on ``self.tree`` otherwise. Attribute names are compared after
    ``normalize``. Links whose ext-link-type is one of the id types listed
    below are left out. The "metadata" entry holds ``innerxml`` of the node.
    """
    skipped_types = (
        "jfm-item-id",
        "zbl-item-id",
        "mr-item-id",
        "nmid",
        "numdam-id",
        "mathdoc-id",
        "sps-id",
        "dmlid",
        "eudml-item-id",
    )
    root = self.meta_root if self.meta_root is not None else self.tree

    result = []
    for node in root.findall("ext-link"):
        rel = href = base = ""
        for attr_name, attr_value in node.attrib.items():
            name = normalize(attr_name)
            if name == "ext-link-type":
                rel = attr_value
            elif name == "href":
                href = attr_value
            elif name == "base":
                base = attr_value

        if rel not in skipped_types:
            result.append(
                {
                    "rel": rel,
                    "mimetype": "",
                    "location": href,
                    "base": base,
                    "metadata": innerxml(node),
                }
            )
    return result

1376

def get_last_modified_iso_8601_date_str(self):
    """Return the "iso-8601-date" attribute of the element at ``self.last_modified_path``.

    When the path is unset or matches nothing, the current time is
    returned in ISO format.
    """
    node = self.tree.find(self.last_modified_path) if self.last_modified_path else None
    if node is not None:
        return node.attrib["iso-8601-date"]
    # Handle the case where the container arrives via ptf-tools: the
    # last-modified date is then the import date.
    return timezone.now().isoformat()

1386

def get_date_published_iso_8601_date_str(self):
    """Return the publication date found at ``self.published_path``.

    The "iso-8601-date" attribute is returned when the element has one.
    Otherwise the date is assembled from the ``year``, ``month`` and
    ``day`` children as "year[-month][-day]"; nothing is appended to an
    empty year. Returns None when the path is unset or matches nothing.
    """
    if not self.published_path:
        return None
    node = self.tree.find(self.published_path)
    if node is None:
        return None
    if "iso-8601-date" in node.attrib:
        return node.attrib["iso-8601-date"]

    parts = {"year": "", "month": "", "day": ""}
    for tag in ("year", "month", "day"):
        sub_node = node.find(tag)
        if sub_node is not None:
            parts[tag] = sub_node.text

    date_str = parts["year"]
    for tag in ("month", "day"):
        if date_str and parts[tag]:
            date_str += "-" + parts[tag]
    return date_str

1415

def get_prod_deployed_date_iso_8601_date_str(self):
    """Return the "iso-8601-date" attribute of the element at ``self.prod_deployed_date_path``.

    Returns None when the path is unset or matches nothing.
    """
    if not self.prod_deployed_date_path:
        return None
    node = self.tree.find(self.prod_deployed_date_path)
    if node is None:
        return None
    return node.attrib["iso-8601-date"]

1423

1424

class StreamGroup:
    """A group of streams built from an element whose children each hold a ``link``.

    ``use`` is the lower-cased "use" attribute of the group element.
    ``streams`` is a list of dicts with the keys "rel", "mimetype",
    "location", "seq" and "text", one per child.
    """

    def __init__(self, tree):
        self.use = tree.get("use").lower()
        streams = []
        for node in tree:
            link = node.find("link")
            rel = href = seq = mimetype = ""
            # Attribute names are compared after normalize().
            for attr_name, attr_value in link.attrib.items():
                name = normalize(attr_name)
                if name == "rel":
                    rel = attr_value
                elif name == "href":
                    href = attr_value
                elif name == "seq":
                    seq = attr_value
                elif name == "content-type":
                    # Read from the link itself: the key comes from
                    # link.attrib, so looking it up on the parent node
                    # raised KeyError or read the wrong element.
                    mimetype = attr_value

            streams.append(
                {
                    "rel": rel,
                    "mimetype": mimetype,
                    "location": href,
                    "seq": seq,
                    "text": link.text or "",
                }
            )
        self.streams = streams

1449

1450

1451###

1452#

class Work(XmlData):
    """Metadata extraction shared by the XmlData subclasses that describe a work.

    Covers abstracts, contributors, keywords, subjects, awards and the
    bibliography. The path attributes read by the methods
    (``abstract_path``, ``trans_abstract_path``, ``contrib_path``,
    ``kwd_path``, ``subj_path``, ``funding_path``) are not defined here;
    subclasses in this file supply them.
    """

    lang = "und"
    back_paths = ("back", "book-back")
    biblio_xpath = "ref-list"

    def inner_get_lang(self, node):
        """Return the lang attribute of ``node``, or ``self.lang`` when it is "und"."""
        the_lang = get_lang_attrib(node)
        return self.lang if the_lang == "und" else the_lang

    def inner_get_abstract(self, node, tag, attrs):
        """Append to ``attrs`` a dict describing the abstract ``node``.

        The dict holds ``tag``, the language and the XML, HTML and TeX
        renderings of the node. Nothing is appended when ``node`` is None.
        """
        if node is None:
            return

        the_lang = self.inner_get_lang(node)
        value_xml = get_mixed_content(node)
        value_html = get_html_mixed_content(node)
        value_tex = get_tex(node)

        attrs.append(
            {
                "tag": tag,
                "lang": the_lang,
                "value_xml": value_xml,
                "value_html": value_html,
                "value_tex": value_tex,
            }
        )

    def get_abstracts(self):
        """Return the abstracts followed by the translated abstracts.

        The tag is the "abstract-type" attribute (default "abstract"),
        prefixed with "trans-" for translated abstracts.
        """
        attrs = []
        for node in self.tree.findall(self.abstract_path):
            tag = node.get("abstract-type") or "abstract"
            self.inner_get_abstract(node, tag, attrs)

        for node in self.tree.findall(self.trans_abstract_path):
            tag = "trans-" + (node.get("abstract-type") or "abstract")
            self.inner_get_abstract(node, tag, attrs)

        return attrs

    def get_contrib_groups(self):
        """Return the non-empty contributor groups found at ``self.contrib_path``.

        Each group is a dict with "content_type" and "contribs". A
        contributor is kept only when ``parse_contrib`` found at least one
        of its name fields.
        """
        name_keys = ("first_name", "last_name", "string_name", "reference_name")
        groups = []
        for group in self.tree.findall(self.contrib_path):
            members = []
            for contrib in group.findall("contrib"):
                params = parse_contrib(contrib)
                params["contrib_type"] = contrib.get("contrib-type") or ""
                params["deceased"] = contrib.get("deceased") or ""
                params["contrib_xml"] = get_mixed_content(contrib)
                if any(params[key] for key in name_keys):
                    members.append(params)
            if members:
                groups.append({"content_type": group.get("content-type") or "", "contribs": members})
        return groups

    def get_kwd_groups(self):
        """Return one dict per keyword group found at ``self.kwd_path``.

        A group holding an ``unstructured-kwd-group`` child is rendered as
        XML/HTML/TeX with an empty "kwds" list. Otherwise the ``innerxml``
        of each ``kwd`` child is collected in "kwds".
        """
        groups = []
        for group in self.tree.findall(self.kwd_path):
            unstructured = group.find("unstructured-kwd-group")
            the_lang = self.inner_get_lang(group)
            if unstructured is not None:
                value_xml = get_mixed_content(unstructured)
                value_tex = get_tex(unstructured)
                value_html = get_html_mixed_content(unstructured)
                groups.append(
                    {
                        "content_type": group.get("content-type") or "",
                        "lang": the_lang,
                        "value_xml": value_xml,
                        "value_html": value_html,
                        "value_tex": value_tex,
                        "kwds": [],
                    }
                )
            else:
                groups.append(
                    {
                        "content_type": group.get("kwd-group-type") or "",
                        "lang": the_lang,
                        "value": "",
                        "kwds": [innerxml(kwd) for kwd in group.findall("kwd")],
                    }
                )
        return groups

    def get_subj_groups(self):
        """Return one dict per subject group found at ``self.subj_path``."""
        groups = []
        for group in self.tree.findall(self.subj_path):
            the_lang = self.inner_get_lang(group)
            groups.append(
                {
                    "content_type": group.get("subj-group-type") or "",
                    "lang": the_lang,
                    "value": "",
                    "subjects": [innerxml(subject) for subject in group.findall("subject")],
                }
            )
        return groups

    def get_awards(self):
        """Return ``{"abbrev", "award_id"}`` dicts for the award groups at ``self.funding_path``.

        An award is reported only when both the funder abbreviation and
        the award id were found. The check used to test the builtin ``id``
        instead of ``award_id``, so awards without an id were reported with
        ``award_id`` set to None.
        """
        awards = []
        for node in self.tree.findall(self.funding_path):
            abbrev = award_id = None
            for name_node in node.findall("funding-source/named-content"):
                tag = name_node.get("content-type") or ""
                # NOTE(review): "abbrevation" (sic) is the value compared
                # against the XML; confirm the spelling used by the data
                # before changing it.
                if tag == "abbrevation":
                    abbrev = innerxml(name_node)
            id_node = node.find("award-id")
            if id_node is not None:
                award_id = innerxml(id_node)

            if abbrev is not None and award_id is not None:
                awards.append({"abbrev": abbrev, "award_id": award_id})

        return awards

    def get_keywords(self):
        """Return the concatenated XML of the keyword groups at ``self.kwd_path``."""
        return self.get_catxml(self.kwd_path)

    def get_bibitems(self):
        """Return a ``BibItem`` for every ``ref`` child of the reference list.

        The back matter is the first element found among
        ``self.back_paths``; the list is its ``self.biblio_xpath`` child.
        Returns an empty list when either is missing.
        """
        back = None  # stays None when back_paths is empty
        for back_path in self.back_paths:
            back = self.tree.find(back_path)
            if back is not None:
                break
        if back is None:
            return []

        ref_list = back.find(self.biblio_xpath)
        if ref_list is None:
            return []
        return [BibItem(ref) for ref in ref_list if ref.tag == "ref"]

1633

1634

class InCollection(XmlData):
    """Membership of a collection: volume, sequence, series and the collection itself."""

    def __init__(self, tree):
        super().__init__(tree)
        volume, seq, vseries = get_volume_and_seq(tree)
        self.volume = volume
        self.seq = seq
        self.vseries = vseries
        # The collection is built from the collection-meta child.
        self.collection = Collection(tree.find("collection-meta"))

1641

1642

class BitsCollection(XmlData):
    """Collection membership read from a BITS-style ``volume-in-collection`` block.

    NOTE(review): ``XmlData.__init__`` is not called here, so whatever it
    sets up is not available on instances — confirm this is intended.
    """

    def __init__(self, tree):
        # Best-effort parsing: every missing or malformed piece falls back
        # to a default. Only the errors these lookups can raise are caught,
        # so KeyboardInterrupt and SystemExit are no longer swallowed.
        lookup_errors = (AttributeError, TypeError, ValueError)
        try:
            seq = int(tree.get("seq"))
        except lookup_errors:
            try:
                seq = int(tree.find("volume-in-collection/volume-number").text)
            except lookup_errors:
                seq = 0
        try:
            volume = tree.find("volume-in-collection/volume-number").text
        except lookup_errors:
            volume = ""
        try:
            series = tree.find("volume-in-collection/volume-series").text
        except lookup_errors:
            series = ""
        self.volume = volume
        self.seq = seq
        self.vseries = series
        self.collection = Collection(tree)

1664

1665

class Publisher(XmlData):
    """Accessors for a publisher element (name, location, mathdoc id)."""

    mathdoc_id_xpath = 'publisher-id[@publisher-id-type="mathdoc-id"]'

    def get_name(self):
        """Return the text of the ``publisher-name`` child."""
        name = self.get_node_text("publisher-name")
        return name

    def get_loc(self):
        """Return the text of the ``publisher-loc`` child."""
        loc = self.get_node_text("publisher-loc")
        return loc

1674

1675

class EventSeries(XmlData):
    """Accessors for an event series element."""

    def __init__(self, tree):
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of the ``event-name`` child."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of the ``event-acronym`` child."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_short_title(self):
        """Return the short title; always the empty string."""
        short_title = ""
        return short_title

1689

1690

class Event(XmlData):
    """Accessors for an event element (name, acronym, date, number, location)."""

    def __init__(self, tree):
        super().__init__(tree)
        event_type = tree.get("event-type")
        self.event_type = event_type

    def get_title(self):
        """Return the text of the ``event-name`` child."""
        title = self.get_node_text("event-name")
        return title

    def get_acro(self):
        """Return the text of the ``event-acronym`` child."""
        acronym = self.get_node_text("event-acronym")
        return acronym

    def get_year(self):
        """Return the text of the ``event-date`` child."""
        year = self.get_node_text("event-date")
        return year

    def get_number(self):
        """Return the text of the ``event-num`` child."""
        number = self.get_node_text("event-num")
        return number

    def get_loc(self):
        """Return the text of the ``event-loc`` child."""
        loc = self.get_node_text("event-loc")
        return loc

1710

1711

# <collection-meta> of a <book>

1713

1714

class Collection(Work):
    """Accessors for the collection metadata of a book."""

    lang = "und"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    ids_xpath = "collection-id"
    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    trans_title_group_elt_path = "title-group/trans-title-group"
    trans_title_path = "title-group/trans-title-group/trans-title"
    # NOTE(review): sibling classes spell this placeholder "Not-supported".
    funding_path = "Not supported"

    contrib_path = "contrib-group"
    id_type_attr = "collection-id-type"

    def get_coltype(self):
        """Return the "collection-type" attribute, defaulting to "collection"."""
        return self.tree.get("collection-type") or "collection"

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or None when absent."""
        node = self.tree.find("publisher")
        if node is not None:
            return Publisher(node)
        return None

    def get_title(self):
        """Return the text of ``title-group/title``."""
        return self.get_node_text("title-group/title")

    def get_abbrev(self):
        """Return the text of ``title-group/abbrev-title``."""
        return self.get_node_text("title-group/abbrev-title")

    def get_ids(self):
        """Return the collection ids plus the print and electronic ISSNs.

        ``issn`` children with pub-type "ppub" are reported as "issn" and
        those with "epub" as "e-issn"; other pub-types are ignored. ISSN
        elements without text are skipped, as ``Journal.get_ids`` does,
        instead of producing ids whose value is None.
        """
        ids = XmlData.get_ids(self)
        id_names = {"ppub": "issn", "epub": "e-issn"}
        for issn in self.tree.findall("issn"):
            id_name = id_names.get(issn.get("pub-type"))
            if id_name is not None and issn.text:
                ids.append((id_name, issn.text))
        return ids

1760

1761

# <journal-meta> of a <journal-issue>

1763

1764

class Journal(Work):
    """Accessors for the journal metadata of a journal issue."""

    ids_xpath = "journal-id"
    id_type_attr = "journal-id-type"
    title_group_elt_path = "journal-title-group"
    title_path = "journal-title-group/journal-title"
    abbrev_title_path = "journal-title-group/abbrev-title"
    trans_title_group_elt_path = "journal-title-group/trans-title-group"
    trans_title_path = "journal-title-group/trans-title-group/trans-title"
    abstract_path = "abstract"
    trans_abstract_path = "trans-abstract"
    contrib_path = "contrib-group"
    kwd_path = "kwd-group"
    subj_path = "Not-supported"
    funding_path = "Not-supported"

    def get_ids(self):
        """Return the journal ids plus the print and electronic ISSNs.

        ``issn`` children with pub-type "ppub" are reported as "issn" and
        those with "epub" as "e-issn"; ISSNs without text or with another
        pub-type are ignored.
        """
        ids = XmlData.get_ids(self)
        for issn in self.tree.findall("issn"):
            pub_type = issn.get("pub-type")
            if not issn.text:
                continue
            if pub_type == "ppub":
                ids.append(("issn", issn.text))
            elif pub_type == "epub":
                ids.append(("e-issn", issn.text))
        return ids

    def get_publisher(self):
        """Return a ``Publisher`` for the ``publisher`` child, or None when absent."""
        node = self.tree.find("publisher")
        return None if node is None else Publisher(node)

    def get_title_group(self):
        """Return ``innerxml`` of the title group element, or "" when absent."""
        node = self.tree.find(self.title_group_elt_path)
        return "" if node is None else innerxml(node)

    def get_title_xml(self):
        """Return the title group as an XML string, or "" when absent."""
        node = self.tree.find(self.title_group_elt_path)
        return "" if node is None else get_mixed_content(node)

    def get_title_html(self):
        """Return the HTML rendering of the title, or "" when absent."""
        node = self.tree.find(self.title_path)
        return "" if node is None else get_html_mixed_content(node)

    def get_title_tex(self):
        """Return the TeX rendering of the title, or "" when absent."""
        node = self.tree.find(self.title_path)
        return "" if node is None else get_tex(node)

    def get_abbrev(self):
        """Return the text of the abbreviated title."""
        path = self.abbrev_title_path
        return self.get_node_text(path)

    def get_coltype(self):
        """Return the "serial-type" entry of ``self.custom_meta`` (None when missing)."""
        meta = self.custom_meta
        return meta.get("serial-type")

1832

1833

class Publication(Journal):
    """A ``Journal`` whose ids and titles live under publication-specific element names."""

    # Identifier elements.
    id_type_attr = "publication-id-type"
    ids_xpath = "publication-id"

    # Title elements.
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    abbrev_title_path = "title-group/abbrev-title"

    # Translated title elements.
    trans_title_group_elt_path = "title-group/trans-title-group"
    trans_title_path = "title-group/trans-title-group/trans-title"

1842

1843

class Issue(Work):
    """Accessors for a journal issue; iterating it yields its articles."""

    mathdoc_id_xpath = 'issue-meta/issue-id[@issue-id-type="mathdoc-id"]'
    ids_xpath = "issue-meta/issue-id"
    abstract_path = "issue-meta/abstract"
    trans_abstract_path = "issue-meta/trans-abstract"
    kwd_path = "issue-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "issue-meta/contrib-group"
    title_group_elt_path = "issue-meta/issue-title"
    title_path = "issue-meta/issue-title"
    # TODO support langs in issue-title
    subtitle_path = ""
    trans_title_path = ""
    trans_title_group_elt_path = ""
    counts_path = "issue-meta/counts"
    last_modified_path = 'issue-meta/history/date[@date-type="last-modified"]'
    published_path = 'issue-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'issue-meta/history/date[@date-type="prod-deployed-date"]'
    funding_path = "Not-supported"

    lang = "und"
    meta_root_xpath = "issue-meta"
    custom_meta_path = "issue-meta/custom-meta-group"

    def _get_node_int(self, path):
        """Return ``make_int`` of the text at ``path``, or 0 when the text is empty."""
        value = self.get_node_text(path)
        if value:
            return make_int(value)
        return 0

    def get_journal(self):
        """Return a ``Journal`` built from the ``journal-meta`` child."""
        node = self.tree.find("journal-meta")
        return Journal(node)

    def get_ctype(self):
        """Return the container type, always "issue"."""
        return "issue"

    def get_vseries(self):
        """Return the text of ``issue-meta/volume-series``."""
        return self.get_node_text("issue-meta/volume-series")

    def get_vseries_int(self):
        """Return the volume series as an int (0 when empty)."""
        return self._get_node_int("issue-meta/volume-series")

    def get_volume(self):
        """Return the text of ``issue-meta/volume``."""
        return self.get_node_text("issue-meta/volume")

    def get_volume_int(self):
        """Return the volume as an int (0 when empty)."""
        return self._get_node_int("issue-meta/volume")

    def get_number(self):
        """Return the text of ``issue-meta/issue``."""
        return self.get_node_text("issue-meta/issue")

    def get_number_int(self):
        """Return the issue number as an int (0 when empty)."""
        return self._get_node_int("issue-meta/issue")

    def get_year(self):
        """Return the text of ``issue-meta/pub-date/year``."""
        return self.get_node_text("issue-meta/pub-date/year")

    def get_event(self):
        """Return an ``Event`` for the ``event`` child, or None when absent."""
        node = self.tree.find("event")
        if node is not None:
            return Event(node)
        return None

    def get_publisher(self):
        """Return the ``publisher`` attribute of the issue's journal."""
        xpublisher = None
        xjournal = self.get_journal()
        if xjournal is not None:
            xpublisher = xjournal.publisher
        return xpublisher

    def __iter__(self):
        """Yield an ``Article`` for every child of ``body``.

        An issue without a ``body`` element yields nothing instead of
        raising TypeError.
        """
        body = self.tree.find("body")
        if body is None:
            return
        for node in body:
            yield Article(node)

1922

1923

class BibItem(XmlData):
    """Accessors for one bibliography reference (a ``ref`` element).

    ``extids`` holds the ``(id_type, value)`` pairs returned by
    ``get_extids``, completed in ``__init__`` with ids found in ``pub-id``
    elements.
    """

    extids_xpath = "*/ext-link"
    extid_type_attr = "ext-link-type"

    # remove_links = True
    def __init__(self, tree):
        super().__init__(tree)
        self.extids = self.get_extids()

        # Temporary code
        # Some xml only have a pub-id (doi) and do not have an ext-link with a ext-link-type=doi
        # We need to manually create the link

        has_doi = any(id_type == "doi" for id_type, _id_value in self.extids)

        for node in self.tree.findall("*/pub-id"):
            value = node.text
            if value is None:
                # An empty pub-id carries no identifier; the DOI branch
                # used to raise AttributeError on it.
                continue
            id_type = node.get("pub-id-type")
            if id_type == "doi" and not has_doi:
                value = value.replace("http://dx.doi.org/", "")
                value = value.replace("https://doi.org/", "")
                value = value.replace("doi:", "")
                self.extids.append(("doi", value))
            elif id_type in ("eid", "arxiv", "tel", "hal", "theses.fr"):
                self.extids.append((id_type, value))

    def _render_citation(self, render):
        """Return ``render`` applied to the first citation element giving a non-empty result.

        ``mixed-citation`` is tried before ``element-citation``; the result
        of the last attempt is returned when both are empty.
        """
        text = ""
        for name in ("mixed-citation", "element-citation"):
            text = render(self.tree.find(name))
            if text:
                break
        return text

    def _bracketed_label(self):
        """Return the label wrapped in square brackets, or "" when there is no label."""
        label = self.get_label()
        if label and label[0] != "[":
            label = "[" + label + "]"
        return label

    def get_ref(self):
        """Return the serialized reference, as returned by ``tostring``."""
        return self.tostring()

    def split_label(self):
        """
        Used when sorting non-digit bibitems
        """
        label = self.label.lower()

        try:
            self.label_prefix, self.label_suffix = re.split(r"[\d]+", label)
        except ValueError:
            # Special case where label is similar as "Sma" instead of "Sma15"
            self.label_prefix, self.label_suffix = [label, ""]

    def get_label(self):
        """Return the text of the ``label`` child, or "" when there is no label element."""
        node = self.tree.find("label")
        if node is not None:
            return node.text
        return ""

    def get_user_id(self):
        """Return the "id" attribute of the reference, or ""."""
        return self.tree.get("id", "")

    def get_citation_xml(self):
        """Return the citation as XML, preceded by a ``<label>`` element when there is a label."""
        text = self._render_citation(get_mixed_content)

        label = self.get_label()
        if label:
            text = "<label>" + label + "</label>" + text

        return text

    def get_citation_html(self):
        """Return the citation as HTML, preceded by the bracketed label when there is one."""
        text = self._render_citation(get_html_mixed_content)

        label = self._bracketed_label()
        if label:
            text = label + " " + text

        return text

    def get_citation_tex(self):
        """Return the citation as TeX, preceded by the bracketed label when there is one."""
        text = self._render_citation(get_tex)

        label = self._bracketed_label()
        if label:
            text = label + " " + text

        return text

    def get_citation_node(self):
        """Return the ``element-citation`` child, else ``mixed-citation``, else None."""
        tree = self.tree.find("element-citation")
        if tree is None:
            tree = self.tree.find("mixed-citation")

        return tree

    def get_type(self):
        """Return the "publication-type" of the citation, defaulting to "misc"."""
        tree = self.get_citation_node()
        if tree is None:
            return "misc"
        return tree.get("publication-type", "misc")

    def get_node_text(self, node_name, tex=False):
        """Return the text of the ``node_name`` child of the citation node.

        With ``tex`` set, the TeX rendering of the child is returned
        instead of its plain ``text``. Returns "" when the citation or the
        child is missing, and also when the child has no text (previously
        None, which was inconsistent with the other empty cases).
        """
        tree = self.get_citation_node()
        if tree is None:
            return ""
        node = tree.find(node_name)
        if node is None:
            return ""
        if tex:
            return get_tex(node)
        return node.text or ""

    def get_publisher_name(self):
        """Return the text of the citation's ``publisher-name``."""
        return self.get_node_text("publisher-name")

    def get_publisher_loc(self):
        """Return the text of the citation's ``publisher-loc``."""
        return self.get_node_text("publisher-loc")

    def get_institution(self):
        """Return the text of the citation's ``institution``."""
        return self.get_node_text("institution")

    def get_series(self):
        """Return the text of the citation's ``series``."""
        return self.get_node_text("series")

    def get_volume(self):
        """Return the text of the citation's ``volume``."""
        return self.get_node_text("volume")

    def get_issue(self):
        """Return the text of the citation's ``issue``."""
        return self.get_node_text("issue")

    def get_year(self):
        """Return the text of the citation's ``year``."""
        return self.get_node_text("year")

    # TODO: comments may have ext-link like arxiv. Add ExtId ?
    def get_comment(self):
        """Return the TeX rendering of the citation's ``comment``."""
        return self.get_node_text("comment", tex=True)

    def get_fpage(self):
        """Return the text of the citation's ``fpage``."""
        return self.get_node_text("fpage")

    def get_lpage(self):
        """Return the text of the citation's ``lpage``."""
        return self.get_node_text("lpage")

    def get_page_range(self):
        """Return the text of the citation's ``page-range``."""
        return self.get_node_text("page-range")

    def get_size(self):
        """Return the text of the citation's ``page-count``, else of its ``size``."""
        text = self.get_node_text("page-count")
        if not text:
            text = self.get_node_text("size")
        return text

    def get_source_tex(self):
        """Return the TeX rendering of the citation's ``source``."""
        return self.get_node_text("source", tex=True)

    def get_article_title_tex(self):
        """Return the TeX rendering of the citation's ``article-title``."""
        return self.get_node_text("article-title", tex=True)

    def get_chapter_title_tex(self):
        """Return the TeX rendering of the citation's ``chapter-title``."""
        return self.get_node_text("chapter-title", tex=True)

    def get_contrib_groups(self):
        """Return a single contributor group built from the names in the citation.

        Every ``name``, ``string-name`` and ``name-alternatives`` child of
        the citation node is parsed with ``parse_name``. Returns an empty
        list when there is no citation node.
        """
        groups = []

        tree = self.get_citation_node()
        if tree is not None:
            name_tags = ("name", "string-name", "name-alternatives")
            contribs = []

            for child in tree:
                if child.tag in name_tags:
                    params = parse_name(child)
                    params["contrib_type"] = ""
                    params["contrib_xml"] = get_mixed_content(child)
                    contribs.append(params)

            groups.append({"content_type": "", "contribs": contribs})
        return groups

2119

2120

class Relation(XmlData):
    """Accessors for a relation element linking to another article."""

    def get_id_type(self):
        """Return the "ext-link-type" attribute, or ""."""
        id_type = self.tree.get("ext-link-type")
        return id_type or ""

    def get_rel_type(self):
        """Return the "related-article-type" attribute, or ""."""
        rel_type = self.tree.get("related-article-type")
        return rel_type or ""

    def get_id_value(self):
        """Return the text of the element, or ""."""
        text = self.tree.text
        return text or ""

    def get_right_pid(self):
        """Return ``self.id_value``."""
        right_pid = self.id_value
        return right_pid

2133

2134

2135class Article(Work):

2136 mathdoc_id_xpath = 'article-id[@pub-id-type="mathdoc-id"]'

2137 ids_xpath = "front/article-meta/article-id"

2138 article_xpath = "front/article-meta"

2139

2140 extids_xpath = (

2141 'front/article-meta/ext-link[@ext-link-type="mr-item-id"]'

2142 '|front/article-meta/ext-link[@ext-link-type="zbl-item-id"]'

2143 '|front/article-meta/ext-link[@ext-link-type="sps-id"]'

2144 '|front/article-meta/ext-link[@ext-link-type="jfm-item-id"]'

2145 )

2146 extid_type_attr = "ext-link-type"

2147 title_group_elt_path = "front/article-meta/title-group"

2148 title_path = "front/article-meta/title-group/article-title"

2149 subtitle_path = "front/article-meta/title-group/subtitle"

2150 trans_title_group_elt_path = "front/article-meta/title-group/trans-title-group"

2151 trans_title_path = "front/article-meta/title-group/trans-title-group/trans-title"

2152 abstract_path = "front/article-meta/abstract"

2153 trans_abstract_path = "front/article-meta/trans-abstract"

2154 kwd_path = "front/article-meta/kwd-group"

2155 subj_path = "front/article-meta/article-categories/subj-group"

2156 contrib_path = "front/article-meta/contrib-group"

2157 meta_root_xpath = "front/article-meta"

2158 custom_meta_path = "front/article-meta/custom-meta-group"

2159 counts_path = "front/article-meta/counts"

2160 published_path = 'front/article-meta/pub-date[@date-type="pub"]'

2161 prod_deployed_date_path = 'front/article-meta/history/date[@date-type="prod-deployed-date"]'

2162 history_path = "front/article-meta/history/date"

2163 funding_path = "front/article-meta/funding-group/award-group"

2164

def __init__(self, tree):
    """Wrap an ``article`` element, unwrapping it first when needed.

    When ``tree`` is not itself an ``article`` element (JATS article
    imported from OAI, where it is surrounded by a header element), the
    namespaces are removed and the element at ``metadata/article`` is
    used instead.
    """
    is_article = tree.tag == "article"
    if not is_article:
        remove_namespace(tree)
        tree = tree.xpath("metadata/article")[0]

    super().__init__(tree)
    self.article_meta = self.get_subtree(self.article_xpath)
    self.atype = tree.get("article-type") or ""
    self.numbering = ""
    self.lang = self.get_lang()

2177

2178 def get_doi(self):

2179 try:

2180 text = self.tree.xpath('front/article-meta/article-id[@pub-id-type="doi"]')[0].text

2181 except BaseException:

2182 return None

2183 else:

2184 return text

2185

2186 # When the JATS XML has only an <article>, we need to construct the Journal on the fly

2187 def get_journal(self):

2188 node = self.tree.xpath("front/journal-meta")[0]

2189 return Journal(node)

2190

2191 def get_issue_id(self):

2192 try:

2193 return self.tree.xpath("front/article-meta/issue-id")[0].text

2194 except:

2195 return ""

2196

2197 def get_volume(self):

2198 try:

2199 return self.tree.xpath("front/article-meta/volume")[0].text

2200 except:

2201 return ""

2202

2203 def get_fpage(self):

2204 return self.get_node_text("front/article-meta/fpage")

2205

2206 def get_lpage(self):

2207 return self.get_node_text("front/article-meta/lpage")

2208

2209 def get_page_type(self):

2210 page_type = ""

2211 node = self.tree.find("front/article-meta/fpage")

2212 if node is not None:

2213 page_type = node.get("content-type")

2214

2215 if page_type is None:

2216 page_type = ""

2217

2218 return page_type

2219

2220 # Olivier 2016-01-13 add page-range & elocation

2221 def get_page_range(self):

2222 return self.get_node_text("front/article-meta/page-range")

2223

2224 def get_elocation(self):

2225 return self.get_node_text("front/article-meta/elocation-id")

2226

2227 def get_body(self):

2228 node = self.tree.find("body")

2229 text = get_node_text(node)

2230 return text

2231

2232 def body_jats_to_html(self, base_url):

2233 body_html = ""

2234 figures = []

2235 node = self.tree.find("body")

2236 if node is not None:

2237 body_html, figures = get_html_mixed_content_with_figures(

2238 node,

2239 is_top=True,

2240 is_citation=False,

2241 is_comment=False,

2242 is_figure=False,

2243 prefix="",

2244 suffix="",

2245 sec_level=2,

2246 label_title="",

2247 figures=figures,

2248 base_url=base_url,

2249 )

2250 return body_html, figures

2251

2252 def get_body_tex(self):

2253 node = self.tree.find("body")

2254 # TODO: body_tex devrait être en fait le HTML va les fourmules TeX en texte

2255 value_tex = get_tex(node)

2256 return value_tex

2257

2258 def get_body_xml(self):

2259 node = self.tree.find("body")

2260 value_xml = get_mixed_content(node)

2261 return value_xml

2262

2263 def get_seq(self):

2264 issue = self.get_subtree("front/article-meta/issue")

2265 seq = 0

2266 if issue is not None:

2267 seq = issue.get("seq") or 0

2268 if not seq:

2269 fpage = self.get_subtree("front/article-meta/fpage")

2270 if fpage is not None:

2271 seq = fpage.get("seq") or 0

2272 try:

2273 seq = int(seq)

2274 except BaseException:

2275 seq = 0

2276 return seq

2277

2278 def get_relations(self):

2279 relations = []

2280 nodes = self.tree.findall("front/article-meta/related-article")

2281 for n in nodes:

2282 rel = Relation(n)

2283 rel.left_pid = self.pid

2284 relations.append(rel)

2285 return relations

2286

2287 def get_history_dates(self):

2288 dates = []

2289 nodes = self.tree.findall(self.history_path)

2290 for node in nodes:

2291 type = node.attrib["date-type"]

2292 date = node.attrib["iso-8601-date"]

2293 dates.append({"type": type, "date": date})

2294

2295 return dates

2296

2297 def get_article_number(self):

2298 return self.custom_meta.get("article-number", "")

2299

2300 def get_talk_number(self):

2301 return self.custom_meta.get("talk-number", "")

2302

2303

class BookSeries(XmlData):
    """Wrapper around the XML description of a book series (collection)."""

    mathdoc_id_xpath = 'collection-id[@collection-id-type="mathdoc-id"]'
    ids_xpath = "collection-id"
    extid_type_attr = "collection-id-type"
    title_group_elt_path = "title-group"
    title_path = "title-group/title"
    subtitle_path = "title-group/subtitle"
    lang = "und"

    def get_ids(self):
        """Return ``(id_type, id_value)`` tuples: the ISSN first (if any), then every collection-id."""
        issn = self.get_node_text("issn")
        ids = [("issn", issn)] if issn else []
        ids.extend(
            (node.get("collection-id-type"), node.text)
            for node in self.tree.findall("collection-id")
        )
        return ids

    def get_title(self):
        """Return the text found at ``title_path``."""
        return self.get_node_text(self.title_path)

    def get_abbrev(self):
        """Return the text of ``title-group/abbrev-title``."""
        return self.get_node_text("title-group/abbrev-title")

    def get_publisher(self):
        """Return a Publisher built from the ``<publisher>`` child, or None."""
        publisher_node = self.tree.find("publisher")
        return None if publisher_node is None else Publisher(publisher_node)

    def get_stype(self):
        """Return the "serial-type" entry of ``custom_meta`` (None if missing)."""
        return self.custom_meta.get("serial-type")

2339

2340

2341# Mixin

class HasParts:
    """Mixin for objects whose tree contains ``book-part`` elements.

    The host class must provide ``tree``, ``xget_subtrees``, ``get_subtree``
    and a ``get_book_part_class`` callable on the class.
    """

    def get_parts(self):
        """Wrap every ``book-part`` and detach the body from the tree.

        The parts are looked up under ``book-body`` first, then ``body``.
        Each one is wrapped with the class returned by
        ``get_book_part_class()``. The first ``book-body``/``body`` element
        found is then removed from ``self.tree``.

        Returns the list of wrapped parts (possibly empty).
        """
        part_trees = None
        for name in ("book-body", "body"):
            part_trees = self.xget_subtrees("%s/book-part" % name)
            if part_trees:
                break

        xparts = []
        if part_trees:
            # Only resolved when there is something to wrap, as not every
            # host class is guaranteed to define it.
            part_class = self.__class__.get_book_part_class()
            xparts = [part_class(part_tree) for part_tree in part_trees]

        body = None
        for name in ("book-body", "body"):
            body = self.get_subtree(name)
            if body is not None:
                break

        if body is not None:
            try:
                root = self.tree.getroot()  # XSLT result tree
            except AttributeError:
                root = self.tree  # Element tree
            root.remove(body)
        return xparts

2363

2364

class BookPart(Work, HasParts):
    """Wrapper around a ``<book-part>`` element.

    The class attributes are the paths (relative to the ``<book-part>``
    root) of the metadata elements read by this class and its base classes.
    """

    id_type_attr = "book-part-id-type"
    part_xpath = "book-part-meta"
    ids_xpath = "book-part-meta/book-part-id"
    mathdoc_id_xpath = 'book-part-meta/book-part-id[@book-part-id-type="mathdoc-id"]'
    meta_xpath = "book-part-meta"
    extids_xpath = (
        'book-part-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-part-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"
    title_group_elt_path = "book-part-meta/title-group"
    title_path = "book-part-meta/title-group/title"
    subtitle_path = "book-part-meta/title-group/subtitle"
    trans_title_group_elt_path = "book-part-meta/title-group/trans-title-group"
    trans_title_path = "book-part-meta/title-group/trans-title-group/trans-title"
    abstract_path = "book-part-meta/abstract"
    trans_abstract_path = "book-part-meta/trans-abstract"
    kwd_path = "book-part-meta/kwd-group"
    # NOTE(review): this is the only path of the class that starts with
    # "front/"; confirm that the prefix is intended.
    subj_path = "front/book-part-meta/article-categories/subj-group"
    contrib_path = "book-part-meta/contrib-group"
    meta_root_xpath = "book-part-meta"
    custom_meta_path = "book-part-meta/custom-meta-group"
    funding_path = "book-part-meta/funding-group/award-group"

    def __init__(self, tree):
        """Build the book part from its ``<book-part>`` element."""
        super().__init__(tree)
        self.part_meta = self.get_subtree(self.part_xpath)
        # A missing "indexed" attribute counts as indexed.
        self.indexed = tree.get("indexed", "true") == "true"
        self.atype = tree.get("book-part-type") or ""
        self.numbering = tree.get("book-part-number") or ""
        self.parts = self.get_parts()
        self.lang = self.get_lang()

    def get_fpage(self):
        return self.get_node_text("book-part-meta/fpage")

    def get_lpage(self):
        return self.get_node_text("book-part-meta/lpage")

    def get_page_range(self):
        """Always return "" for a book part."""
        return ""

    def get_page_type(self):
        """Return the ``content-type`` attribute of ``<fpage>``, or ""."""
        node = self.tree.find("book-part-meta/fpage")
        if node is None:
            return ""
        return node.get("content-type") or ""

    def get_seq(self):
        """Return ``self.fpage`` converted to int, or 0 when it is not an integer."""
        try:
            return int(self.fpage)
        except (TypeError, ValueError):
            return 0

    def get_body(self):
        """Return the serialized ``<body>`` (UTF-8 bytes), or "" when absent."""
        node = self.tree.find("body")
        if node is not None:
            return etree.tostring(node, encoding="utf-8", xml_declaration=False)
        return ""

    def get_relations(self):
        """Return one Relation per ``<related-article>``, with left_pid set to self.pid."""
        relations = []
        for node in self.tree.findall("book-part-meta/related-article"):
            rel = Relation(node)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

    def get_article_number(self):
        return self.custom_meta.get("article-number", "")

    def get_talk_number(self):
        return self.custom_meta.get("talk-number", "")

2449

2450

def get_volume_and_seq(incol):
    """Return ``(volume, seq, vseries)`` for a collection membership node.

    ``seq`` is the integer ``seq`` attribute of ``incol`` when it has one.
    Otherwise it is built from the digits found in the ``<volume>`` text
    before the first "-" ("12-13" gives 12), or 0 when there are none.
    When ``<volume-series>`` holds an integer, ``seq`` is offset by
    ``vseries * 10000``.

    ``volume`` and ``vseries`` are the text of ``<volume>`` and
    ``<volume-series>``, or "" when the element is absent.
    """
    volume_node = incol.find("volume")

    try:
        seq = int(incol.get("seq"))
    except (TypeError, ValueError):
        # No usable "seq" attribute: fall back on the volume number.
        seq = 0
        if volume_node is not None and volume_node.text:
            first_volume = volume_node.text.split("-")[0]
            digits = "".join(ch for ch in first_volume if ch.isdigit())
            try:
                seq = int(digits)
            except ValueError:
                # No digit at all, or digits int() cannot parse (e.g. "²").
                seq = 0

    volume = volume_node.text if volume_node is not None else ""

    vseries_node = incol.find("volume-series")
    vseries = vseries_node.text if vseries_node is not None else ""

    if vseries:
        try:
            # no more than 10000 books in a series (gasp)
            seq = int(vseries) * 10000 + seq
        except ValueError:
            # Non-numeric series: keep the seq computed above.
            pass
    return (volume, seq, vseries)

2480

2481

class Book(Work, HasParts):
    """Wrapper around a ``<book>`` element.

    The class attributes are the paths (relative to the book root) of the
    metadata elements read by this class and its base classes.
    """

    id_type_attr = "book-id-type"
    mathdoc_id_xpath = 'book-meta/book-id[@book-id-type="mathdoc-id"]'
    ids_xpath = "book-meta/book-id"
    book_xpath = "book-meta"
    extids_xpath = (
        'book-meta/ext-link[@ext-link-type="mr-item-id"]'
        '|book-meta/ext-link[@ext-link-type="zbl-item-id"]'
        '|book-meta/ext-link[@ext-link-type="jfm-item-id"]'
    )
    extid_type_attr = "ext-link-type"

    title_group_elt_path = "book-meta/book-title-group"
    title_path = "book-meta/book-title-group/book-title"
    alternate_title_group_elt_path = "collection-meta/volume-in-collection/volume-title"
    alternate_title_path = "collection-meta/volume-in-collection/volume-title"
    trans_title_group_elt_path = "book-meta/book-title-group/trans-title-group"
    trans_title_path = "book-meta/book-title-group/trans-title-group/trans-title"
    subtitle_path = "book-meta/book-title-group/subtitle"

    abstract_path = "book-meta/abstract"
    trans_abstract_path = "book-meta/trans-abstract"
    kwd_path = "book-meta/kwd-group"
    subj_path = "Not-supported"
    contrib_path = "book-meta/contrib-group"
    meta_root_xpath = "book-meta"
    custom_meta_path = "book-meta/custom-meta-group"
    counts_path = "book-meta/counts"
    last_modified_path = 'book-meta/pub-history/date[@date-type="last-modified"]'
    published_path = 'book-meta/pub-date[@date-type="pub"]'
    prod_deployed_date_path = 'book-meta/pub-history/date[@date-type="prod-deployed-date"]'
    year_path = "book-meta/pub-date/year"
    funding_path = "Not-supported"

    mbook_seq = 0
    mbook_volume = ""
    mbook_vseries = ""

    def __init__(self, tree):
        """Build the book from ``tree``.

        ``tree`` is either the ``<book>`` element itself or an element that
        contains it under ``metadata/book`` (OAI import).
        """
        # Case when we import the book from OAI.
        # The <book> tag is surrounded by a <header> tag. Remove this tag.
        if tree.tag != "book":
            remove_namespace(tree)
            tree = tree.xpath("metadata/book")[0]
            # Guard against a <book> without children before looking at the first one.
            if len(tree) and tree[0].tag == "front":
                tree = tree.xpath("front")[0]

        super().__init__(tree)
        self.book_meta = self.get_subtree(self.book_xpath)
        self.contrib_groups = []
        try:
            self.book_type = tree.get("book-type") or "Book"
        except AttributeError:
            # tree has no get(): read the attribute on its root element.
            self.book_type = tree.getroot().get("book-type") or "Book"
        # if self.book_type == 'proceedings' or self.book_type == 'edited-book'
        # or self.book_type == 'monograph' :
        if self.book_type:
            self.parts = self.get_parts()

            # patch for book without contrib-group:
            # 1 : monograph with book_parts : contrib-group of book equal to the
            # contrib-group of the first book-part
            # OR 2 : edited-books with same author for all of its book_parts : book-type become 'monograph' and
            # contrib-group of book equal to the contrib-group of the first book-part
            # OR 3 : edited-books but not same author for all book-parts : contrib-group of
            # book become "Collectif"
            self.contrib_groups = self.get_contrib_groups()
            if not self.contrib_groups:
                if self.book_type == "monograph" and self.parts:
                    first_part = self.parts[0]
                    self.contrib_groups = first_part.get_contrib_groups()
                elif self.book_type == "edited-book" and self.parts:
                    # check if authors of the book-parts are identical
                    book_part_contrib_group = self.parts[0].get_contrib_groups()
                    authors_differ = any(
                        part.get_contrib_groups() != book_part_contrib_group
                        for part in self.parts
                    )
                    if not authors_differ:
                        # Case 2 above: the book type becomes "monograph".
                        # This used to be a no-op "==" comparison.
                        self.book_type = "monograph"
                        self.contrib_groups = book_part_contrib_group
                    else:
                        self.contrib_groups = [
                            {
                                "contribs": [
                                    {
                                        "first_name": "",
                                        "last_name": "Collectif",
                                        "suffix": "",
                                        "string_name": "Collectif",
                                        "reference_name": "Collectif",
                                        "contrib_xml": "<contrib><name><surname>Collectif</surname><given-names>"
                                        + "</given-names></name><name-alternatives>"
                                        + '<string-name specific-use="index">Collectif</string-name></name-alternatives></contrib>',
                                        "prefix": "",
                                        "contrib_type": "author",
                                    }
                                ],
                                "content_type": "authors",
                            }
                        ]

            self.body = ""
        # else: #or self.book_type == 'monograph': for a monograph there is no book-part, body holds the full text
        #    self.parts = []
        self.incollection = self.get_incollection()

        self.lang = self.get_lang()

    @staticmethod
    def get_book_part_class():
        """Return the class used by ``get_parts`` to wrap each book-part."""
        return BookPart

    def get_doi(self):
        """Return the text of the DOI ``<book-id>``, or None if there is none."""
        try:
            return self.tree.xpath('book-meta/book-id[@book-id-type="doi"]')[0].text
        except IndexError:
            # xpath() returned an empty list: the book has no DOI.
            return None

    def get_ctype(self):
        """Return "book-" followed by the book type."""
        return "book-%s" % self.book_type

    def get_contrib_groups(self):
        """Return the contrib groups already stored on the book, else those of the base class."""
        if self.contrib_groups:
            return self.contrib_groups
        return super().get_contrib_groups()

    def get_publisher(self):
        """Return a Publisher built from ``book-meta/publisher``, or None."""
        node = self.tree.find("book-meta/publisher")
        if node is not None:
            return Publisher(node)
        return None

    def get_year(self):
        return self.get_node_text(self.year_path)

    def get_title(self):
        """Return the book title, falling back on the volume title of the collection."""
        # NOTE(review): this path differs from title_path
        # ("book-meta/book-title-group/book-title"); confirm which one is expected.
        text = self.get_node_text("book-meta/title-group/title")
        if not text:
            # The result of this fallback lookup used to be discarded.
            text = self.get_node_text("collection-meta/volume-in-collection/volume-title")
        return text

    def get_body(self):
        """Return the serialized ``<book-body>`` (UTF-8 bytes), or "" when absent."""
        node = self.tree.find("book-body")
        if node is not None:
            return etree.tostring(node, encoding="utf-8", xml_declaration=False)
        return ""

    def get_incollection(self):
        """Return the collections the book belongs to.

        ``<in-collection>`` children are wrapped as InCollection; when there
        are none, ``<collection-meta>`` children are wrapped as
        BitsCollection instead.
        """
        incols = [InCollection(node) for node in self.tree.findall("in-collection")]
        if incols:
            return incols
        return [BitsCollection(node) for node in self.tree.findall("collection-meta")]

    def get_event(self):
        """Return an Event built from ``book-meta/event``, or None."""
        node = self.tree.find("book-meta/event")
        if node is not None:
            return Event(node)
        return None

    def get_event_series(self):
        """Return an EventSeries built from ``book-meta/event-series``, or None."""
        node = self.tree.find("book-meta/event-series")
        if node is not None:
            return EventSeries(node)
        return None

    def get_vseries(self):
        return self.get_node_text("book-meta/volume-series")

    def get_frontmatter(self):
        """Return the inner XML of ``<front-matter>``, or "" when absent."""
        node = self.tree.find("front-matter")
        if node is not None:
            return innerxml(node)
        return ""

    def get_relations(self):
        """Return one Relation per ``<related-article>``, with left_pid set to self.pid."""
        relations = []
        for node in self.tree.findall("book-meta/related-article"):
            rel = Relation(node)
            rel.left_pid = self.pid
            relations.append(rel)
        return relations

2675

2676

# Maps the class name accepted by xobj_fromtree() to the wrapper class
# that is instantiated for it.
factories = {
    "collection": Collection,
    "publisher": Publisher,
    "journal": Journal,
    "issue": Issue,
    "article": Article,
    "book": Book,
}

2685

2686

def xobj_fromtree(classname, tree):
    """Instantiate the class registered under ``classname`` in ``factories`` with ``tree``.

    Raises KeyError when ``classname`` is not registered.
    """
    return factories[classname](tree)

2690

2691

def xobj_fromstring(classname, metadata):
    """Parse the XML in ``metadata`` and wrap the resulting tree via ``xobj_fromtree``."""
    return xobj_fromtree(classname, etree.fromstring(metadata))

2695

2696

def xobj_fromfile(classname, path):
    """Read the file at ``path`` as bytes and wrap its XML via ``xobj_fromstring``."""
    # The context manager closes the file even if read() fails.
    with open(path, "rb") as stream:
        metadata = stream.read()
    return xobj_fromstring(classname, metadata)

2700

2701

def update_bibitem_xml(bibitem, new_ids):
    """Rebuild a BibItem after replacing the ids of its citation.

    ``bibitem.citation_xml`` is wrapped in a ``<ref>`` element and parsed.
    In its ``element-citation`` (or, failing that, ``mixed-citation``) child:

    - every ``ext-link`` / ``pub-id`` child whose type is a key of
      ``new_ids`` is removed;
    - one child is appended for each entry of ``new_ids`` that is checked
      and not a false positive.

    ``new_ids`` maps an id type to a dict with the keys "checked",
    "false_positive" and "id_value". When the XML has no citation element,
    the tree is left untouched.

    Returns a BibItem built from the resulting tree.
    """
    parser = etree.XMLParser(
        huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
    )
    tree = etree.fromstring("<ref>" + bibitem.citation_xml + "</ref>", parser=parser)

    citation = tree.find("element-citation")
    if citation is None:
        citation = tree.find("mixed-citation")

    if citation is not None:
        # Attribute holding the id type, for each kind of id element.
        type_attributes = {"ext-link": "ext-link-type", "pub-id": "pub-id-type"}

        # Collect first, remove afterwards: do not mutate while iterating.
        stale_children = []
        for child in citation:
            type_attribute = type_attributes.get(child.tag)
            if type_attribute is None:
                continue
            id_type = child.get(type_attribute)
            if id_type and id_type in new_ids:
                stale_children.append(child)

        for child in stale_children:
            citation.remove(child)

        for id_type, value_dict in new_ids.items():
            if value_dict["checked"] and not value_dict["false_positive"]:
                if id_type in ("doi", "arxiv", "tel", "hal", "theses.fr"):
                    new_node = etree.Element("pub-id")
                    new_node.set("pub-id-type", id_type)
                else:
                    new_node = etree.Element("ext-link")
                    new_node.set("ext-link-type", id_type)

                new_node.text = value_dict["id_value"]
                citation.append(new_node)

    return BibItem(tree)

2741

2742

2743#########################################################################################

2744#

2745# Create XML strings based on internal data

2746#

2747#########################################################################################

2748

2749

def get_contrib_xml(type, first_name, last_name, prefix, suffix, deceased):
    """Return a ``<contrib>`` XML string for one contributor.

    ``type`` (if truthy) becomes the ``contrib-type`` attribute and a truthy
    ``deceased`` adds ``deceased="yes"``. The ``<name>`` element gets, in
    this order, a ``prefix``, ``given-names``, ``surname`` and ``suffix``
    child for each non-empty value.

    The values are inserted as they are, without XML escaping.
    """
    attributes = ""
    if type:
        attributes += ' contrib-type="' + type + '"'
    if deceased:
        attributes += ' deceased="yes"'

    name_fields = (
        ("prefix", prefix),
        ("given-names", first_name),
        ("surname", last_name),
        ("suffix", suffix),
    )
    name_xml = "".join(
        "<" + tag + ">" + value + "</" + tag + ">" for tag, value in name_fields if value
    )

    return "<contrib" + attributes + "><name>" + name_xml + "</name></contrib>"

2770

2771

def get_title_xml(title):
    """Return ``title`` wrapped in ``<title-group><article-title>`` markup.

    ``title`` is inserted as it is, without XML escaping.
    """
    opening = '<title-group xmlns:xlink="http://www.w3.org/1999/xlink"><article-title xml:space="preserve">'
    closing = "</article-title></title-group>"
    return opening + title + closing