Coverage for apps/ptf/cmds/xml_cmds.py: 67%
1212 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1import copy
2import datetime
3import os.path
4import subprocess
5import sys
6import traceback
8from lxml import ElementInclude
9from lxml import etree
11from django.conf import settings
12from django.db import transaction
13from django.db.models import Prefetch
14from django.utils import timezone
16from ptf import exceptions
17from ptf import model_data
18from ptf import model_data_comparator
19from ptf import model_data_converter
20from ptf import model_helpers
21from ptf import tex
22from ptf import utils
23from ptf.cmds import ptf_cmds
24from ptf.cmds import solr_cmds
25from ptf.cmds.base_cmds import baseCmd
26from ptf.cmds.xml import xml_utils
27from ptf.cmds.xml.cedrics import cedrics_parser
29# KEEP THIS UNUSED IMPORT THEY ARE USED
30from ptf.cmds.xml.jats import jats_parser
31from ptf.cmds.xml.jats import xmldata as xmldata_jats
32from ptf.cmds.xml.xml_utils import normalize
33from ptf.display import resolver
34from ptf.models import Article
35from ptf.models import Collection
36from ptf.models import Container
37from ptf.models import Person
38from ptf.models import backup_obj_not_in_metadata
39from ptf.models import backup_translation
40from ptf.models import restore_obj_not_in_metadata
41from ptf.models import restore_translation
def find_file(name):
    """Return the absolute path of *name* searched recursively under every
    directory listed in ``settings.MANAGER_XSLT_DIRS``, or ``None`` if absent."""
    for search_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _dirnames, filenames in os.walk(search_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None
def get_transform(name):
    """Build an lxml XSLT transform from the stylesheet ``<name>.xsl``.

    The stylesheet is located with :func:`find_file` among the directories of
    ``settings.MANAGER_XSLT_DIRS``.

    :param name: stylesheet base name, without the ``.xsl`` extension
    :return: a callable ``etree.XSLT`` object
    :raises FileNotFoundError: if no ``<name>.xsl`` file can be found
    """
    file_path = find_file(f"{name}.xsl")
    if file_path is None:
        # etree.parse(None) would raise an obscure TypeError; fail explicitly
        # with the name of the missing stylesheet instead.
        raise FileNotFoundError(f"XSLT stylesheet not found: {name}.xsl")
    xslt_doc = etree.parse(file_path)
    return etree.XSLT(xslt_doc)
class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    # When False, subclasses provide already-parsed data and no XML body is required.
    use_body = True
    # Raw XML text to parse (required when use_body is True).
    body = None
    # lxml root node built from body in pre_do().
    tree = None
    solr_commit_at_the_end = True
    # Name of the log file the body was written to (set in pre_do).
    xml_filename_in_log = None
    remove_blank_text = False
    # Base folder used by ElementInclude to resolve hrefs.
    xml_file_folder = None

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return an unused log file name "<today>-<ClassName>-<i>.xml" inside
        settings.LOG_DIR, or "" if LOG_DIR is not configured."""
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Increment the suffix until we find a name that is not taken yet.
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree (when use_body is True) and archive
        the XML body in settings.LOG_DIR."""
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )

            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href: drop the xlink namespace
                # so that xlink:href attributes become plain href attributes.
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                self.xml_filename_in_log = self.get_logname()

                with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Execute the command inside a DB transaction.

        On failure: roll back Solr, clear sub-commands (so undo is skipped),
        log the failure with its traceback in LOG_DIR/cmds.log, and re-raise.
        On success: commit Solr (if solr_commit_at_the_end) and return the
        created object(s).
        """
        try:
            obj = super().do(parent)
        except Exception as e:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            raise e

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove Person objects that are no longer referenced.
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the pids of the created resource(s) and archive the XML next to
        the resource's collection folder in LOG_DIR."""
        super().post_do(resource)

        # Remove Person objects that are no longer referenced.
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list: build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach location-based objects (ExtLink / RelatedObject /
        SupplementaryMaterial / DataStream) parsed from the XML to *resource*.

        :param xobjs: list of dicts with at least "base", "rel", "location"
        :param resource: the Resource the objects are attached to
        :param cmd_type: one of "ExtLink", "RelatedObject",
            "SupplementaryMaterial", "DataStream"
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice
                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete *publisher* by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the [fyear, lyear] range of the collection *pid* so that it
        includes *container*.year. Persists the collection when *save* is True.
        """
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # Fix: test the falsy (None/0) guard BEFORE the comparison.
            # The original "fyear < collection.fyear or not collection.fyear"
            # raised TypeError on Python 3 when collection.fyear was None.
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()
class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # Fallback Provider used when the parsed XML does not declare one.
    provider = None
    xml_format = None

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or update, when *update* is True) one Collection from the
        parsed XML object *xcol*.

        Returns the Collection, or None when *xcol* is empty.
        Raises exceptions.ResourceExists when the collection already exists
        and *update* is False.
        """
        if not xcol:
            return None

        # Prefer the provider declared in the XML, fall back to self.provider.
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        # Choose the add or update sub-command depending on prior existence.
        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        # Attach the <ext-link> elements found in the XML to the collection.
        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta>
        child node. Returns the list of created Collections.

        Raises ValueError for <journal-meta>/<collection-meta> nodes: creating
        a journal/collection from those elements is not supported here.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections
class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # When True, assign a DOI to each imported article.
    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    # NOTE(review): internal_do/post_do use self._prod_deployed_date_iso_8601_date_str
    # (leading underscore); this class attribute (no underscore) appears unused here
    # — confirm against the rest of the file.
    prod_deployed_date_iso_8601_date_str = None
    # Pre-parsed issue data; when set, the XML body is not re-parsed.
    xissue = None
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child Collection of *journal* for the (e-)issn of *xjournal*.

        The child pid is "<xjournal.pid>-<issn>"; its provider is "mathdoc".
        Returns the new Collection.
        """
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the Collection that matches *xjournal* when meta-collections
        are enabled (settings.USE_META_COLLECTIONS); *journal* is the top
        collection. Creates the child collection on the fly if needed.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is None in this branch, so the second
            # candidate pid is "<pid>-None" — confirm this is intentional.
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the Container for the issue, then one Article per item of
        the issue. Returns the Container (or None when skipped for embargo).
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        for seq, xarticle in enumerate(xissue, start=1):
            params = {
                "xarticle": xarticle,
                "journal": journal,
                "issue": issue,
                "seq": seq,
                "provider": provider,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit_at_the_end": False,
            }
            cmd = addArticleXmlCmd(params)
            cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        """Finalize the imported issue: set last_modified if missing, propagate
        prod_deployed_date (on ptf_tools only), restore extra data."""
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A later data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()
class addArticleXmlCmd(addXmlCmd):
    """
    addArticleXmlCmd: adds/remove an issue

    Exception raised:
        - exceptions.ResourceExists during do if the article already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Article does not exist
            during do if the serial/issue/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    """

    # Pre-parsed article data; when None, it is built from self.tree.
    xarticle = None
    journal = None
    issue = None
    provider = None
    provider_col = None
    assign_doi = False
    full_text_folder = ""
    xml_format = "xmldata_jats"
    # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset
    restricted_mode = False
    # standalone is used to import isolated article, without issues
    standalone = False
    seq = (
        0  # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ)
    )
    # When True, existing translations are backed up and restored around the re-import.
    keep_translations = False

    def set_collection(self, collection):
        self.journal = collection
        self.provider = collection.provider

    def set_xml_format(self, xml_format):
        self.xml_format = xml_format

    def set_provider(self, provider):
        self.provider = provider

    def set_provider_col(self, provider_col):
        self.provider_col = provider_col

    def set_article_single_mode(self):
        """Parse self.tree as a single JATS <article> into self.xarticle."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()

    def get_oai_identifier(self):
        return self.xarticle.oai_identifier

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (full text used for SolR indexing) when it
        is empty, from the article's PDF or from a separate XML file."""
        # Import CEDRICS, le plein texte provient d'un fichier séparé
        # (CEDRICS import: the full text comes from a separate file)
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's full PDF.
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # Read the <body> of a separate per-article XML file.
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # NOTE(review): hard-coded Numdam acquisition path — confirm it is
            # still valid for this deployment.
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

    def internal_do(self):
        """Create (or re-create) the Article from self.xarticle.

        If the article already exists and update/standalone mode is on, the
        existing article is deleted first (after backing up the data that is
        not in the metadata, and optionally its translations), then re-added.
        Returns the created Article.
        """
        super().internal_do()

        if self.xarticle is None and self.journal is not None:
            # self.restricted_mode = True
            self.set_article_single_mode()
            self.update = True
        else:
            self.update = False

        # Derive a pid from the DOI when the XML does not provide one.
        if self.xarticle.pid is None:
            self.xarticle.pid = (
                self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        # Load the HTML full text of the translations (separate files).
        for xtranslated_article in self.xarticle.translations:
            for xtream in xtranslated_article.streams:
                if xtream["mimetype"] == "text/html":
                    if self.from_folder is None:
                        raise ValueError(
                            "The article has its full text in a separate HTML file. You need to set from_folder"
                        )

                    location = os.path.join(self.from_folder, xtream["location"])
                    body_html = resolver.get_body(location)
                    body = xml_utils.get_text_from_xml_with_mathml(body_html)
                    xtranslated_article.body_html = body_html
                    xtranslated_article.body = body

        # Load the HTML full text of the article itself.
        for stream in self.xarticle.streams:
            if stream["mimetype"] == "text/html":
                location = os.path.join(self.from_folder, stream["location"])
                body_html = resolver.get_body(location)
                body = xml_utils.get_text_from_xml_with_mathml(body_html)
                self.xarticle.body_html = body_html
                self.xarticle.body = body

        if self.xarticle.doi:
            article = model_helpers.get_article_by_doi(self.xarticle.doi)
        else:
            article = model_helpers.get_article(self.xarticle.pid)
        needs_to_restore_article = False

        if article is not None:
            if self.update or self.standalone:
                if self.standalone:
                    self.provider = article.provider

                needs_to_restore_article = True
                backup_obj_not_in_metadata(article)

                if self.keep_translations:
                    backup_translation(article)

                cmd = ptf_cmds.addArticlePtfCmd(
                    {
                        "pid": article.pid,
                        # delete the files to be safe before re-importing
                        "to_folder": self.to_folder,
                    }
                )
                cmd.set_object_to_be_deleted(article)
                cmd.undo()
            else:
                raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")

        # Override seq
        if self.standalone and article is not None:
            self.xarticle.seq = article.seq
        elif (
            not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
        ) or (hasattr(self, "pii") and self.seq != 0):
            self.xarticle.seq = self.seq

        # Get the article's text (body) for SolR if it is empty from the PDF
        self.update_xobj_with_body()

        params = {
            "xobj": self.xarticle,
            "pid": self.xarticle.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "assign_doi": self.assign_doi and not self.xarticle.doi,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addArticlePtfCmd(params)
        if self.issue or not self.standalone:
            cmd.set_container(self.issue)
        cmd.add_collection(self.journal)
        article = cmd.do(self)

        self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
        self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
        if not self.restricted_mode:
            self.add_objects_with_location(
                self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
            )

            if (
                hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
            ) or settings.SITE_NAME == "ptf_tools":
                self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")

        # Attach the streams of each translation to its translated article.
        for xtrans_article, trans_article in zip(
            self.xarticle.translations, cmd.cmd.translated_articles
        ):
            self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")

        if needs_to_restore_article:
            restore_obj_not_in_metadata(article)

            if self.keep_translations:
                restore_translation(article)

        return article
class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/removes translations.
    The original article is not changed.
    The current translations (for self.lang) are first removed, then the
    translation found in the XML is added back, together with optional
    HTML/PDF datastreams, and the translated PDF is (re)generated.

    Attributes:
        lang: language code of the translation to add/replace
        html_file_name: location of the translated HTML file (optional)
        pdf_file_name: location of the translated PDF file (optional)
        date_published_str: ISO-8601 publication date; when set, the PDF is
            assumed to already exist and its compilation is skipped

    Raises:
        exceptions.ResourceDoesNotExist: if the article is not in the database
    """

    lang = ""
    html_file_name = ""
    pdf_file_name = ""
    date_published_str = ""

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Fix: use the local `xarticle`; `self.xarticle` is never set on
            # this command, so the original raised AttributeError instead of
            # the intended ResourceDoesNotExist.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge the existing article with the new translation:
        # keep all current translations except the one in self.lang
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article
class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd: imports a PCJ article.

    When html_file_name is set, a full-text HTML datastream is attached to
    the parsed article before it is added through addArticleXmlCmd.
    """

    html_file_name = ""

    def internal_do(self):
        super().internal_do()

        parsed_article = jats_parser.JatsArticle(tree=self.tree)

        if self.html_file_name:
            # Attach the HTML full text as a datastream
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            parsed_article.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": parsed_article,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()
class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
    - exceptions.ResourceExists during do if the book already exists
    - exceptions.ResourceDoesNotExist
        during undo if the Book does not exist
        during do if the serial/provider does not exist
        <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    - RuntimeError during undo if resources are still published
    """

    provider = None
    import_oai_mode = False
    journal = None
    xml_format = "xmldata_jats"
    xbook = None
    _collection = None

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        # Book parts are numbered from 1 at each nesting level
        if xparts:
            for seq, xpart in enumerate(xparts, start=1):
                self.add_part(xpart, seq, pseq)

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article), then recurse on its sub-parts."""
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Fix: use self.provider; the local `provider` is only bound when
        # self.provider was not already set via set_provider(), so the
        # original raised NameError in that case.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book
1190######################################################################################
1191######################################################################################
1192#
1193# Update Commands
1194#
1195######################################################################################
1196######################################################################################
class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection does not exist
    - RuntimeError if undo is called
    """

    def _parse_collection_node(self, node):
        """Map an XML node to its metadata parser; return None for unknown tags."""
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Update one collection from its parsed metadata.

        With do_update=False, only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist if it does not.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)
            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (no update yet)
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            self.update_collection(xcol, False)

        # Second pass: perform the updates.
        # Guard against unknown tags: the original code dereferenced
        # xcol.warnings even when no parser matched (AttributeError on None).
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is None:
                continue
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1279#####################################################################
1280#
1281# replaceIssueXmlCmd: updates an issue
1282#
1283# Exception raised:
1284# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
1285# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
1286# - RuntimeError if undo is called
1287#
1288######################################################################
class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: deletes an existing issue and re-imports it from the XML.

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
    - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue before re-importing it
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Fix: guard against issues without a publisher, consistent with
        # addBookXmlCmd (the original dereferenced publisher unconditionally).
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Book does not exist
    - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree)
        self.warnings.extend(parsed_book.warnings)

        existing_book = model_helpers.get_container(parsed_book.pid)
        if existing_book is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current book before re-importing it
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing_book)
        delete_cmd.undo()

        # Re-import the book from the parsed XML
        return addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        ).do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        super().internal_do()

        # Dispatch on the root tag, then run the sub-command once
        tag = normalize(self.tree.tag)

        if tag == "journal-issue":
            sub_cmd = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif tag == "book":
            sub_cmd = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        obj = sub_cmd.do()
        self.warnings.extend(sub_cmd.warnings)
        return obj

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    Adds an issue if it is not in the system or updates the issue if it is already there.
    By default, no DOI is assigned for the articles. Set assign_doi to True.

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy
    backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json

    keep_metadata:
        True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder
        Default: False
        Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called
        We always want to preserve GraphicalAbstracts (they are not in the issue XML)

    keep_translations:
        True if you want back up/restore translations.
        Default: False
        Note: When you post an article to a journal (test) website, the translation is declared in the XML
        But if you import a Cedrics article in Trammel, the XML does not list translations

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
      <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    - RuntimeError if undo is called
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    assign_doi = False
    full_text_folder = ""

    # Pre-parsed issue (JatsIssue); when set, self.body/self.tree is not parsed again
    xissue = None
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

        # The two options are mutually exclusive: restoring backed-up metadata
        # would clash with freshly assigned DOIs.
        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.backup_folder is None:
            raise ValueError("backup_folder needs to be set when keep_metadata is true.")

    def internal_do(self):
        super().internal_do()

        # Parse the issue XML unless a pre-parsed xissue was supplied
        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree, from_folder=self.from_folder, no_bib=self.no_bib
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print each distinct warning (de-duplicated by value)
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        # fake mode: parse only, do not touch the database
        if self.fake:
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.embargo and existing_issue.embargo():
                # Geodesic is for open access articles.
                # We do not want to import the issues under embargo
                print(f"Embargo, ignore {xissue.pid}")
                return None

            if self.keep_metadata:
                # Start with a backup of the existing issue, in case of a bug.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"),
                    }
                ).do()

                # Save the extra data (extid, deployed_date, ...) in a json
                # file that will be re-imported along with the new issue
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.backup_folder,
                    "export_all": True,
                    "with_binary_files": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Always back up objects that are not part of the XML metadata
            # (e.g. GraphicalAbstracts) — see the class docstring.
            for article in existing_issue.article_set.all():
                backup_obj_not_in_metadata(article)
                if self.keep_translations:
                    backup_translation(article)

            # Delete the existing issue, otherwise the import would complain
            # about already existing articles
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, the articles of the "to appear" volume are moved into a
            # new volume before publication (from AIF_0__0_ to AIF_2018... for example).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3, AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not an option because doi3 would be lost.
            # The articles common to _0__0 and 2018_ have to be deleted before
            # importing the new volume, otherwise there would be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the extra data (extid, deployed_date, ...) in a json
                # file that will be re-imported with the new issue, as well as
                # the image associated via ptf-tools
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.backup_folder,
                        "export_all": True,
                        "with_binary_files": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                # Remove from the "to appear" issue the articles that are also
                # in the new issue (matched by DOI), backing them up first
                for xarticle in xissue:
                    xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        backup_obj_not_in_metadata(article)
                        if self.keep_translations:
                            backup_translation(article)

                        params = {"to_folder": self.to_folder}  # for the deletion of the binary files
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        # If backup_folder is not None, addIssueXmlCmd.post_do() uses importExtraDataPtfCmd
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "extra_folder": self.backup_folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "embargo": self.embargo,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if new_issue:
            new_articles = new_issue.article_set.all()

            # With the assign_doi option, check that the DOIs were assigned
            for article in new_articles:
                if self.assign_doi and article.doi is None:
                    raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

                # TODO garbage collector on articles no longer in the issue
                restore_obj_not_in_metadata(article)
                if self.keep_translations:
                    restore_translation(article)

        return new_issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateBookXmlCmd(addXmlCmd):
    """Delete the existing book (if any), then (re-)import it from the XML."""

    # Pre-parsed book (BitsBook); when set, self.body/self.tree is not parsed again
    xbook = None

    def internal_do(self):
        super().internal_do()

        if not self.xbook:
            xbook = jats_parser.BitsBook(tree=self.tree)
            self.warnings.extend(xbook.warnings)
        else:
            xbook = self.xbook

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        if book:
            # Remove the existing book before re-importing it
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(book)
            cmd.undo()

            collection = book.get_collection()

            # update the collection first and last year
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit": False,
            }
        )
        book = cmd.do()
        return book
class updateBibitemCitationXmlCmd(baseCmd):
    """Regenerate the XML/HTML/TeX citations of a bibitem from its external ids."""

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the current external ids of the bibitem, keyed by id_type
        new_ids = {
            item.id_type: {
                "id_type": item.id_type,
                "id_value": item.id_value,
                "checked": item.checked,
                "false_positive": item.false_positive,
            }
            for item in self.bibitem.bibitemid_set.all()
        }

        regenerated = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(regenerated.warnings)

        # Store the regenerated citation representations
        self.bibitem.citation_xml = regenerated.citation_xml
        self.bibitem.citation_html = regenerated.citation_html
        self.bibitem.citation_tex = regenerated.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1735######################################################################################
1736######################################################################################
1737#
1738# Import Commands
1739#
1740######################################################################################
1741######################################################################################
class collectEntireCollectionXmlCmd(baseCmd):
    """
    Get the PIDs of all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # Only the pid of each (pid, file) pair is of interest here
        return [
            item_pid
            for item_pid, _ in resolver.iterate_collection_folder(self.folder, self.pid)
        ]
class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        # Default options; `params` may override them before the required
        # parameters are checked.
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False
        self.keep_translations = False

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Optional progress reporting: callback(job, i) is invoked after each issue
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        """Create the collection if needed, then import every issue of the folder."""
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: import collection.xml first
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        # Import each issue found in the folder, optionally starting at first_issue
        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import the Cedrics metadata XML files of the collection
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()
class importCedricsIssueXmlCmd(baseCmd):
    """
    Import a Cedrics issue into the database (legacy workflow based on
    /cedram_dev/exploitation/cedram and the cedram2ptf.py XSL conversion).

    Params:
        colid (required): pid of the collection
        input_file: path of the Cedrics issue XML file
        remove_email: remove emails during the Cedrics -> JATS conversion
        remove_date_prod: ignore the production date during the conversion
        diff_only: only compare the XML with the database content, do not import
        xissue: pre-parsed issue; when absent it is built from input_file
        copy_files: copy binary files to the test data folder
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True
        self.remove_date_prod = True
        self.diff_only = False
        self.body = None
        self.xissue = None
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): assumes issue.article_set and tex_folders are in the
            # same order and of the same length — an extra article would raise
            # an IndexError. TODO confirm against resolver.get_cedram_tex_folders.
            i = 0
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Record the article folder name as the article's "ojs-id"
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """
        Compare the parsed issue (self.xissue) with the issue stored in the database.

        Returns (result, issues_diff, xissue):
            result: False when the database and the XML differ
            issues_diff: dict filled by the comparator with the differences
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything the comparator will walk, to avoid N+1 queries
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            # Debug timing of the database fetch/conversion
            delta = timezone.now() - time1
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """
        Remove <output_folder>/<colid>/<basename of input_file> if present and
        make sure its parent folders exist. Returns the output file path.
        """
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        # SECURITY NOTE: shell=True with interpolated paths. The values come from
        # settings and from trusted folders; do not reuse with untrusted input.
        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            # Guard against appending the same path on every import
            lib_folder = ptf_xsl_folder + "/lib"
            if lib_folder not in sys.path:
                sys.path.append(lib_folder)

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                # the with statement closes file_ when the exception propagates
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Build the issue from the XML file unless it was passed in directly
        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result
2121# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """
    Import a Cedrics issue directly from /cedram_dev/production_tex/CEDRAM:
    the Cedrics XML is parsed as-is, without the Cedrics -> JATS transformation.

    Extra params (on top of importCedricsIssueXmlCmd):
        is_seminar: the collection is a seminar (different Cedrics parsing)
        force_dois: raise ValueError when a parsed article has no DOI
    """

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None
        self.dois = None  # filled by import_cedrics_issue, together with article_folders
        self.force_dois = True
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """
        Return the lines of filename, skipping the first skip_lines + 1 lines
        (presumably the XML declaration/DOCTYPE/root lines of a -cdrxml file,
        re-written by import_cedrics_issue — TODO confirm).
        Tries utf-8 first, then falls back to iso-8859-1.
        """

        def read_lines(encoding):
            # lines with index 0..skip_lines are dropped
            with open(filename, encoding=encoding) as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

        try:
            return read_lines("utf-8")
        except UnicodeDecodeError:
            return read_lines("iso-8859-1")

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for folder in self.article_folders:
                src_file = os.path.join(tex_src_folder, folder, folder + "-cdrxml.xml")
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """
        Add or update the issue in Django/SolR via addOrUpdateIssueXmlCmd,
        then import the article full texts in HTML.
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue
class addCedricsIssueXmlCmd(addXmlCmd):
    """
    Parse a Cedrics issue XML into a CedricsIssue object.

    The XML body is handled by the addXmlCmd base class (self.tree is
    presumably built from the body param — TODO confirm in addXmlCmd);
    internal_do only runs the cedrics parser on it.
    """

    # Class-level defaults; baseCmd params may override them.
    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False  # seminars are parsed differently by cedrics_parser

    def internal_do(self):
        super().internal_do()

        # Parse the issue; the result is both stored and returned.
        self.xissue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)

        return self.xissue
class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """
    Add or update one Cedrics article inside an existing issue.

    Params:
        container_pid (required): pid of the existing issue
        article_folder_name (required): folder of the article under
            settings.CEDRAM_TEX_FOLDER/<colid>/<container_pid>/

    Raises:
        exceptions.ResourceDoesNotExist: the issue is not in the database
        ValueError: the parsed article has no DOI
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, _dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        for seq, folder in enumerate(article_folders, start=1):
            if folder == self.article_folder_name:
                xarticle.seq = seq

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/removal of the existing article
        if existing_article:
            # Start with a backup of the existing issue, in case of a bug.
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

        # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article
class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    transformBodyInHtmlXmlCmd: transform the JATS body in HTML

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the HTML stylesheet and run it on the parsed JATS tree.
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))
        html_root = transform(self.tree).getroot()

        # The generated HTML of interest is the inner XML of <main>.
        main_node = html_root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")
class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    addBodyInHtmlXmlCmd: read the JATS body of an article
    and create the corresponding HTML

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        super().pre_do()

        # At least one of pid/article must be provided; derive the missing one.
        if self.article is None:
            if self.pid is None:
                raise ValueError("pid et article sont vides")
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        parsed = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # should we collect the warnings of the HTML parsing?
        # self.warnings.extend(parsed.warnings)

        # Replace the html-image related objects with the figures found in the XML.
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(parsed.figures, self.article, "RelatedObject")

        update_params = {
            "body_html": parsed.body_html,
            "body_tex": parsed.body_tex,
            "body_xml": parsed.body_xml,
            "use_page_count": False,
        }
        update_cmd = ptf_cmds.updateArticlePtfCmd(update_params)
        update_cmd.set_article(self.article)
        update_cmd.do()

        # copy_binary_files will call resolver.copy_html_images
        # to copy the article images
        # because updateArticlePtfCmd is not from addPtfCmd, need to copy files here
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )
class updateCacheXmlCmd(baseCmd):
    """
    recreate the citation_html field of the bibitems

    Params:
        colid (required): pid of the collection to process
        start_id: optional container pid; containers that come before it
            (in pid order) are skipped

    Raises exceptions.ResourceDoesNotExist if the collection or an article
    is not in the database.
    """

    def __init__(self, params=None):
        self.colid = None
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])

    def update_article(self, xarticle):
        """
        Copy the html/tex fields parsed from the XML (xarticle) onto the
        article stored in the database.
        """
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): zip stops at the shorter side — assumes the XML and the
        # database have the same abstracts/bibitems in the same order. TODO confirm.
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        # Sites that display the full text also refresh the body fields
        if getattr(settings, "SHOW_BODY", False):
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

    def internal_do(self):
        super().internal_do()

        collection = model_helpers.get_collection(self.colid)
        if collection is None:
            raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist")

        qs = collection.content.all().order_by("pid")
        # Skip containers until start_id is reached (process everything if unset)
        start = self.start_id is None
        for container in qs:
            if not start and container.pid == self.start_id:
                start = True

            if start:
                print(container.pid)
                with_body = getattr(settings, "SHOW_BODY", False)
                xml_body = ptf_cmds.exportPtfCmd(
                    {"pid": container.pid, "with_body": with_body}
                ).do()

                # Re-parse the exported issue XML, then update each article
                parser = etree.XMLParser(
                    huge_tree=True,
                    recover=True,
                    remove_blank_text=False,
                    remove_comments=True,
                    resolve_entities=True,
                )
                tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser)
                xissue = jats_parser.JatsIssue(tree=tree)

                for xarticle in xissue:
                    self.update_article(xarticle)