Coverage for apps/ptf/cmds/xml_cmds.py: 66%

1161 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-02-28 09:09 +0000

1import copy 

2import datetime 

3import os.path 

4import subprocess 

5import sys 

6import traceback 

7 

8from lxml import ElementInclude 

9from lxml import etree 

10 

11from django.conf import settings 

12from django.db import transaction 

13from django.db.models import Prefetch 

14from django.utils import timezone 

15 

16from ptf import exceptions 

17from ptf import model_data 

18from ptf import model_data_comparator 

19from ptf import model_data_converter 

20from ptf import model_helpers 

21from ptf import tex 

22from ptf import utils 

23from ptf.cmds import ptf_cmds 

24from ptf.cmds import solr_cmds 

25from ptf.cmds.base_cmds import baseCmd 

26from ptf.cmds.xml import xml_utils 

27from ptf.cmds.xml.cedrics import cedrics_parser 

28 

29# KEEP THIS UNUSED IMPORT THEY ARE USED 

30from ptf.cmds.xml.jats import jats_parser 

31from ptf.cmds.xml.jats import xmldata as xmldata_jats 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.display import resolver 

34from ptf.models import Article 

35from ptf.models import Collection 

36from ptf.models import Container 

37from ptf.models import Person 

38from ptf.models import backup_obj_not_in_metadata 

39from ptf.models import restore_obj_not_in_metadata 

40 

41 

def find_file(name):
    """Return the full path of the first file named *name* found by walking
    the directories listed in settings.MANAGER_XSLT_DIRS, or None."""
    candidates = (
        os.path.join(dirpath, name)
        for search_dir in settings.MANAGER_XSLT_DIRS
        for dirpath, _dirnames, filenames in os.walk(search_dir)
        if name in filenames
    )
    return next(candidates, None)

49 

50 

def get_transform(name):
    """Load the XSLT stylesheet ``<name>.xsl`` from the manager XSLT dirs.

    :param name: stylesheet basename, without the ``.xsl`` extension
    :return: a compiled ``etree.XSLT`` transform
    :raises FileNotFoundError: if the stylesheet cannot be located.
        (Without this check, ``etree.parse(None)`` would fail with a
        confusing TypeError/OSError.)
    """
    file_path = find_file(f"{name}.xsl")
    if file_path is None:
        raise FileNotFoundError(f"XSLT stylesheet not found: {name}.xsl")
    xslt_doc = etree.parse(file_path)
    return etree.XSLT(xslt_doc)

55 

56 

class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    use_body = True  # when False, the XML body param is not required/parsed
    body = None  # the raw XML string
    tree = None  # lxml tree parsed from body
    solr_commit_at_the_end = True  # commit to SolR after a successful do()/undo()
    xml_filename_in_log = None  # path of the XML copy written under LOG_DIR
    remove_blank_text = False  # see pre_do(): must stay False for Cedrics XML
    xml_file_folder = None  # base folder used to resolve <xi:include> hrefs

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a unique log file name "<date>-<class>-<i>.xml" under
        settings.LOG_DIR (empty string if LOG_DIR is not configured)."""
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Increment the suffix until an unused name is found
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree and archive the XML under LOG_DIR."""
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )
            # if isinstance(self.body, str):
            #     self.body = self.body
            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            # t = get_transform('strip-namespace')
            # self.tree = t(tree).getroot()
            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Run the command inside a DB transaction.

        On failure: roll back SolR, drop sub-commands (so undo is skipped),
        log the traceback to LOG_DIR/cmds.log, and re-raise.
        On success: commit SolR if solr_commit_at_the_end is set.
        """
        try:
            obj = super().do(parent)
        except Exception as e:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            raise e

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove orphan Person objects left by the undo
        Person.objects.clean()

    def post_do(self, resource=None):
        """Clean orphan Persons and log the created resource(s) under LOG_DIR."""
        super().post_do(resource)

        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list; build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            # Also archive the XML under LOG_DIR/<top collection pid>/<pid>/
            if hasattr(resource, "my_collection") and resource.my_collection:
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

        # #if test, then raise an exeption if self.warnings not empty (in self.warnings we have all tags not parsed)
        # if 'test' in sys.argv:
        #     if len(self.warnings) > 0:
        #         print(self.warnings)
        #         raise UserWarning("All tags are not parsed", self.warnings)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach located objects (ExtLink/RelatedObject/SupplementaryMaterial/
        DataStream) described by the xobjs dicts to *resource*, preserving order
        through a 1-based seq counter."""
        seq = 1

        for xobj in xobjs:
            base = None

            # Create the XmlBase (base URL) on the fly if needed
            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice

                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

            # seq advances even for skipped XML files so the original order is kept
            seq += 1

    # def add_metadata_parts(self, xobj, resource):
    #     for (seq, name, data) in xobj.metadataparts:
    #         params = {"name": name,
    #                   "data": data,
    #                   "seq": seq,
    #                   "solr_commit": False}
    #
    #         cmd = ptf_cmds.addMetaDataPartPtfCmd(params)
    #         cmd.set_resource(resource)
    #         cmd.do(self)

    @staticmethod
    def remove_publisher(publisher):
        """Delete *publisher* by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the collection's [fyear, lyear] range to include the
        container's year, optionally saving the collection."""
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # BUGFIX: test "not set" first. The original code evaluated
            # "fyear < collection.fyear" before the emptiness guard, which
            # raises TypeError on Python 3 when collection.fyear is None
            # (int < None is not allowed).
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()

323 

class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # fallback provider when the XML does not declare one
    xml_format = None  # name of the xmldata module used to parse the XML

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or, when update=True and it exists, update) the Collection
        described by the parsed *xcol* object and attach its ext-links.

        Returns the Collection, or None if xcol is empty.
        Raises exceptions.ResourceExists if the collection exists and
        update is False.
        """
        if not xcol:
            return None

        # Provider declared in the XML wins over the one set on the command
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta>
        child node. Returns the list of created Collections.

        Raises ValueError for <journal-meta>/<collection-meta> input: creating
        a journal/collection from those nodes is not supported here.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections

425 

426 

class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    import_folder is to be used if you want to import extra data (extid false_positive...)
    It is typically used when an issue is imported from an archive

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    assign_doi = False  # forwarded to addArticleXmlCmd for each article
    full_text_folder = ""  # forwarded to addArticleXmlCmd (body extraction)
    import_folder = None  # when set, extra data is restored in post_do
    prod_deployed_date_iso_8601_date_str = None
    xissue = None  # pre-parsed issue; when None the XML body is parsed
    count = 0

    def create_child_collection(self, xjournal, journal):
        """Create a child collection of *journal* for *xjournal*, with a pid
        of the form "<pid>-<issn>" (used with meta-collections)."""
        # Prefer the print ISSN, fall back to the electronic one
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the collection the issue really belongs to.

        *journal* is the top collection; when USE_META_COLLECTIONS is enabled,
        the issue is attached to the child collection matching the xjournal's
        ISSN (created on the fly if missing). Otherwise *journal* is returned
        unchanged.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is None here, so the second candidate pid is
            # literally "<pid>-None" — presumably intentional for collections
            # created without an ISSN; verify against create_child_collection.
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the issue (container), its located objects and its articles.

        Returns the created issue.
        Raises exceptions.ResourceExists if the issue already exists, and
        exceptions.ResourceDoesNotExist if the journal is unknown.
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        for seq, xarticle in enumerate(xissue, start=1):
            params = {
                "xarticle": xarticle,
                "journal": journal,
                "issue": issue,
                "seq": seq,
                "provider": provider,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit_at_the_end": False,
            }
            cmd = addArticleXmlCmd(params)
            cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        """Finalize the issue: ensure last_modified is set, propagate the
        production deployed date (on ptf-tools), and restore extra data from
        import_folder when requested."""
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools: if the issue XML has a prod_deployed_date, propagate it
        # to the Articles/Issue.
        # A possible later data restoration (with importExtraDataPtfCmd) may
        # overwrite prod_deployed_date.
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.import_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.import_folder}
            ).do()

642 

643 

class addArticleXmlCmd(addXmlCmd):
    """
    addArticleXmlCmd: adds/remove an issue

    Exception raised:
        - exceptions.ResourceExists during do if the article already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Article does not exist
            during do if the serial/issue/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    """

    xarticle = None  # pre-parsed article; when None the XML body is parsed
    journal = None  # the article's collection
    issue = None  # the article's container
    provider = None
    provider_col = None
    assign_doi = False
    full_text_folder = ""  # where to read the full text (PDF or XML) from
    xml_format = "xmldata_jats"
    # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset
    restricted_mode = False
    # standalone is used to import isolated article, without issues
    standalone = False
    seq = (
        0  # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ)
    )
    do_backup_obj_not_in_metadata = False

    def set_collection(self, collection):
        self.journal = collection
        self.provider = collection.provider

    def set_xml_format(self, xml_format):
        self.xml_format = xml_format

    def set_provider(self, provider):
        self.provider = provider

    def set_provider_col(self, provider_col):
        self.provider_col = provider_col

    def set_article_single_mode(self):
        """Parse self.tree as a single JATS article into self.xarticle."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()

    def get_oai_identifier(self):
        return self.xarticle.oai_identifier

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (text used by SolR) when it is empty,
        reading either the article PDF or a separate full-text XML file."""
        # CEDRICS import: the full text comes from a separate file
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's PDF stream
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # Read the <body> of a separate full-text XML file
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # NUMDAM import: look for the full-text XML in the acquisition tree
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

    def internal_do(self):
        """Create (or replace, in update/standalone mode) the article with its
        ext-links, datastreams, supplementary materials, figures and
        translations. Returns the created Article."""
        super().internal_do()

        if self.xarticle is None and self.journal is not None:
            # self.restricted_mode = True
            self.set_article_single_mode()
            self.update = True
        else:
            self.update = False

        if self.xarticle.pid is None:
            # Derive a pid from the DOI when the XML does not provide one
            self.xarticle.pid = (
                self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        if self.from_folder:
            # Load the HTML body of each translation from disk
            for xtranslated_article in self.xarticle.translations:
                for xtream in xtranslated_article.streams:
                    if xtream["mimetype"] == "text/html":
                        location = os.path.join(self.from_folder, xtream["location"])
                        body_html = resolver.get_body(location)
                        body = xml_utils.get_text_from_xml_with_mathml(body_html)
                        xtranslated_article.body_html = body_html
                        xtranslated_article.body = body

        article = model_helpers.get_article(self.xarticle.pid)

        if article is not None:
            # In update/standalone mode an existing article is deleted first,
            # then recreated below; otherwise it is an error.
            if self.update or self.standalone:
                if self.standalone:
                    self.provider = article.provider

                if self.do_backup_obj_not_in_metadata:
                    backup_obj_not_in_metadata(article)

                cmd = ptf_cmds.addArticlePtfCmd(
                    {
                        "pid": article.pid,
                        "to_folder": self.to_folder,  # delete the files to be safe
                    }
                )
                cmd.set_object_to_be_deleted(article)
                cmd.undo()
            else:
                raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")

        # Override seq
        if self.standalone and article is not None:
            self.xarticle.seq = article.seq
        elif (
            not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
        ) or (hasattr(self, "pii") and self.seq != 0):
            self.xarticle.seq = self.seq

        # Get the article's text (body) for SolR if it is empty from the PDF
        self.update_xobj_with_body()

        params = {
            "xobj": self.xarticle,
            "pid": self.xarticle.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "assign_doi": self.assign_doi and not self.xarticle.doi,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addArticlePtfCmd(params)
        if self.issue or not self.standalone:
            cmd.set_container(self.issue)
        cmd.add_collection(self.journal)
        article = cmd.do(self)

        self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
        self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
        if not self.restricted_mode:
            self.add_objects_with_location(
                self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
            )

        # Figures are only indexed where the body is displayed
        if (
            hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
        ) or settings.SITE_NAME == "ptf_tools":
            self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")

        for xtrans_article, trans_article in zip(
            self.xarticle.translations, cmd.cmd.translated_articles
        ):
            self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")

        if self.do_backup_obj_not_in_metadata:
            restore_obj_not_in_metadata(article)

        return article

861 

862 

class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/remove translations.
    The original article is not changed.
    The current translations (for self.lang) are first removed, then the
    translations found in the XML are added back, with optional HTML/PDF
    datastreams, and the translated PDF is (re)generated.
    """

    lang = ""  # language code of the translation to add/replace
    html_file_name = ""  # HTML file already copied on disk by upload/views
    pdf_file_name = ""  # PDF file of the translation
    date_published_str = ""  # if set, the PDF is assumed to already exist

    def internal_do(self):
        """Replace the translations of an existing article for self.lang.

        Returns:
            The re-created Article.

        Raises:
            exceptions.ResourceDoesNotExist: if the article is not in the database.
        """
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Bug fix: the parsed article is the local `xarticle`; this command
            # has no `self.xarticle`, so the old code raised AttributeError here
            # instead of the intended ResourceDoesNotExist.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge existing article with new translation:
        # keep every translation except the one for self.lang
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        # Re-create the article (standalone mode) with the merged translations
        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article

957 

958 

class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Book does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # Provider model instance (resolved from the XML if unset)
    import_oai_mode = False  # True when importing from an OAI harvest
    journal = None  # Collection used in OAI mode ("GDML_Books")
    xml_format = "xmldata_jats"  # name of the xmldata module used in OAI mode
    xbook = None  # pre-parsed book (skips XML parsing when set)
    _collection = None  # Collection the book belongs to

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        """Add a list of sibling book parts, numbered from 1, under parent seq `pseq`."""
        if xparts:
            for seq, xpart in enumerate(xparts, start=1):
                self.add_part(xpart, seq, pseq)

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article) and recurse into its children.

        Raises:
            exceptions.ResourceExists: if the book part is already in the database.
        """
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        # Recurse: children of this part are numbered under this part's seq
        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        """Parse the book XML, delete any existing book and import the new one.

        Returns:
            The new Container (book).
        """
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books", sites=False)
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Bug fix: use self.provider. The local `provider` was only bound inside
        # the `if not self.provider:` branch above, so a pre-set provider
        # (via set_provider) caused a NameError here.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book

1115 

1116 

1117###################################################################################### 

1118###################################################################################### 

1119# 

1120# Update Commands 

1121# 

1122###################################################################################### 

1123###################################################################################### 

1124 

1125 

class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    @staticmethod
    def _parse_collection_node(node):
        """Parse one child node into a collection-like object, or None for unknown tags."""
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Check that the collection exists and optionally update it.

        Args:
            xcol: parsed collection metadata (may be None; then nothing happens).
            do_update: when False, only verify the collection exists.

        Returns:
            The Collection model instance, or None when xcol is None.

        Raises:
            exceptions.ResourceDoesNotExist: if the collection is not in the database.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)

            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        """Validate then update every collection found in the XML tree.

        Returns:
            The list of updated Collection model instances.
        """
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (no update performed)
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            self.update_collection(xcol, False)

        # Second pass: perform the updates
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is None:
                # Robustness fix: unknown tags used to fall through with
                # xcol = None and crash on xcol.warnings below
                continue
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1204 

1205 

1206##################################################################### 

1207# 

1208# replaceIssueXmlCmd: updates an issue 

1209# 

1210# Exception raised: 

1211# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1212# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1213# - RuntimeError if undo is called 

1214# 

1215###################################################################### 

class replaceIssueXmlCmd(addXmlCmd):
    """Replace an existing issue: delete it, then re-import it from the XML.

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider
          does not exist
        - RuntimeError if undo is called
    """

    def internal_do(self):
        """Delete the existing issue and re-import it.

        Returns:
            The new Container (issue).

        Raises:
            exceptions.ResourceDoesNotExist: if the journal or the issue is missing.
        """
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Bug fix: my_publisher can be None; guard before dereferencing
        # (consistent with addBookXmlCmd.internal_do)
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1270 

1271 

class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    def internal_do(self):
        """Delete the existing book and re-import it from the XML tree."""
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree)
        self.warnings.extend(parsed_book.warnings)

        existing = model_helpers.get_container(parsed_book.pid)

        if existing is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current book before re-importing it
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing)
        delete_cmd.undo()

        add_cmd = addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        return add_cmd.do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1319 

1320 

class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    assign_doi = False
    full_text_folder = ""
    import_folder = None
    temp_folder = None
    fake = False  # Parse the XML but do not import

    def check_params(self):
        super().check_params()

    def internal_do(self):
        """Dispatch to the issue or book command based on the XML root tag."""
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        if root_tag == "journal-issue":
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "import_folder": self.import_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                }
            )
        elif root_tag == "book":
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "import_folder": self.import_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        result = delegate.do()
        self.warnings.extend(delegate.warnings)
        return result

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1374 

1375 

class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    adds an issue if it is not in the system or updates the issue if it is already there.
    If the issue is already in the system, preserves the following metadata
    - DOI
    - Matching

    By default, no metadata is preserved. Use the "keep_metadata" param
    By default, no DOI is assigned for the articles. Set assign_doi to True.
    from_folder: folder where binary files are
    import_folder: folder where extra data are stored (extids,...). Ex: /mathdoc_archive
    temp_folder: temp folder where extra data can be stored.
    import_folder is used with keep_metadata=False
    temp_folder is used with keep_metadata=True

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    keep_metadata = False  # preserve extra data (extid, deployed_date, ...) across the replace
    assign_doi = False  # assign DOIs to the imported articles (incompatible with keep_metadata)
    full_text_folder = ""  # Cedrics: the full text for SolR is in a separate file
    import_folder = None  # extra-data source when keep_metadata=False
    temp_folder = None  # extra-data scratch space when keep_metadata=True
    xissue = None  # pre-parsed issue (skips XML parsing when set)
    fake = False  # Parse the XML but do not import
    do_backup_obj_not_in_metadata = False  # backup article objects absent from the metadata

    def check_params(self):
        """Validate the mutually-exclusive/dependent options.

        Raises:
            ValueError: if keep_metadata is combined with assign_doi, or
                keep_metadata is requested without a temp_folder.
        """
        super().check_params()

        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.temp_folder is None:
            raise ValueError("temp_folder has to be specified when keep_metadata=True.")

    def internal_do(self):
        """Import the issue, replacing any existing one.

        Returns the new Container (issue), or None in fake mode.
        """
        super().internal_do()

        # Parse the issue XML unless a pre-parsed issue was supplied
        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree, from_folder=self.from_folder
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print each distinct warning once (deduplicated on the warning value)
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        if self.fake:
            # Parse-only mode: nothing is imported
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.do_backup_obj_not_in_metadata:
                for article in existing_issue.article_set.all():
                    backup_obj_not_in_metadata(article)

            if self.keep_metadata:
                # Start by backing up the existing issue, in case of a bug.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(self.temp_folder, "backup"),
                    }
                ).do()

                # Save the extra data (extid, deployed_date, ...) in a JSON
                # file that is re-imported along with the new issue
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.temp_folder,
                    "export_all": True,
                    "with_binary_files": True,
                    "do_backup_obj_not_in_metadata": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Delete the existing issue, otherwise the import complains
            # about existing articles
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, the articles of the "to appear" volume are moved into a
            # new volume before publication (from AIF_0__0_ to AIF_2018... for example).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3, AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not a good solution because doi3 would be lost.
            # The articles common to _0__0 and 2018_ must be deleted before the
            # new volume is imported, otherwise there would be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the extra data (extid, deployed_date, ...) in a JSON
                # file re-imported along with the new issue, as well as the
                # image associated via ptf-tools
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.temp_folder,
                        "export_all": True,
                        "with_binary_files": True,
                        "do_backup_obj_not_in_metadata": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                # Delete from the "to appear" issue the articles that are also
                # in the new issue (matched by DOI)
                for xarticle in xissue:
                    xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        params = {"to_folder": self.to_folder}  # so the binary files get deleted
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        folder = self.temp_folder if self.keep_metadata else self.import_folder

        # If folder is not None, addIssueXmlCmd.post_do() uses importExtraDataPtfCmd
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "import_folder": folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if self.keep_metadata:
            # The extra-data JSON has been consumed by the import; remove it
            file_ = resolver.get_archive_filename(
                self.temp_folder,
                new_issue.my_collection.pid,
                new_issue.pid,
                "json",
                False,
            )
            if os.path.exists(file_):
                os.remove(file_)

        new_articles = new_issue.article_set.all()

        # With the assign_doi option, check that the DOIs were indeed assigned
        for article in new_articles:
            if self.assign_doi and article.doi is None:
                raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

            restore_obj_not_in_metadata(article)

        return new_issue

    # node_tag = self.tree.tag
    # for child in self.tree:
    #     node_tag = child.tag

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1570 

1571 

class addOrUpdateBookXmlCmd(addXmlCmd):
    """Add a book, deleting any existing one with the same pid first."""

    import_folder = None  # folder where extra data are stored
    xbook = None  # pre-parsed book (skips XML parsing when set)

    def internal_do(self):
        super().internal_do()

        if self.xbook:
            parsed = self.xbook
        else:
            parsed = jats_parser.BitsBook(tree=self.tree)
            self.warnings.extend(parsed.warnings)

        existing = model_helpers.get_container(parsed.pid)

        if existing:
            # Delete the current book, then refresh the collection year range
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing)
            delete_cmd.undo()

            collection = existing.get_collection()

            # update the collection first and last year
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        add_cmd = addBookXmlCmd(
            {
                "xbook": parsed,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.import_folder,
                "to_folder": self.to_folder,
                "solr_commit": False,
            }
        )
        return add_cmd.do()

1613 

1614 

class updateBibitemCitationXmlCmd(baseCmd):
    """Regenerate the citation XML/HTML/TeX of a bibitem from its current ids."""

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the bibitem's current external ids, keyed by id type
        fresh_ids = {
            item.id_type: {
                "id_type": item.id_type,
                "id_value": item.id_value,
                "checked": item.checked,
                "false_positive": item.false_positive,
            }
            for item in self.bibitem.bibitemid_set.all()
        }

        updated = jats_parser.update_bibitem_xml(self.bibitem, fresh_ids)
        self.warnings.extend(updated.warnings)

        # Persist the regenerated citation representations
        self.bibitem.citation_xml = updated.citation_xml
        self.bibitem.citation_html = updated.citation_html
        self.bibitem.citation_tex = updated.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1650 

1651 

1652###################################################################################### 

1653###################################################################################### 

1654# 

1655# Import Commands 

1656# 

1657###################################################################################### 

1658###################################################################################### 

1659 

1660 

class collectEntireCollectionXmlCmd(baseCmd):
    """
    Get the PIDs of all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # Only the pid of each entry is collected; the file path is ignored
        collected = []
        for issue_pid, _ in resolver.iterate_collection_folder(self.folder, self.pid):
            collected.append(issue_pid)
        return collected

1680 

1681 

1682class importEntireCollectionXmlCmd(baseCmd): 

1683 """ 

1684 Import all the XML of a collection (collection.xml, issues.xml) of a given folder 

1685 

1686 results: 

1687 """ 

1688 

1689 def __init__(self, params=None): 

1690 self.pid = None 

1691 self.import_folder = None 

1692 self.temp_folder = None 

1693 self.keep_metadata = False 

1694 self.caller = None 

1695 self.callback = None 

1696 self.job = None 

1697 self.to_folder = None 

1698 self.with_cedrics = True 

1699 self.from_cedrics = False # The entire collection is in Cedrics format 

1700 self.date_for_pii = False # Fetch publication_date for Elsevier articles 

1701 self.first_issue = "" 

1702 self.fake = False # Parse the XML but do not import 

1703 

1704 super().__init__(params) 

1705 

1706 self.required_params.extend(["pid", "import_folder"]) 

1707 

    def internal_do(self):
        """Import an entire collection from ``self.import_folder``.

        Workflow:
        1. Look up the collection resource; if it is absent (and this is not
           a dry run), create it from the collection XML of the archive.
        2. Iterate the issue files of the collection folder and import each
           one, either directly from the Cedrics XML (``self.from_cedrics``)
           or from a JATS container XML.
        3. With ``self.with_cedrics``, also (re)import every issue XML found
           in the Cedrics metadata folder.

        Progress is reported through ``self.callback(self.job, i)`` when a
        callback is set; otherwise the pids are printed.

        :raises ValueError: if the folder/pid does not contain a collection
        """
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: create it from the
            # collection XML stored in the archive folder.
            body = resolver.get_archive_body(self.import_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.import_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.import_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        i = 0
        # NOTE(review): the loop variable `pid` shadows the collection pid
        # read above; inside this loop `pid` is the pid of the current issue.
        for pid, file_ in resolver.iterate_collection_folder(
            self.import_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                # The issue file is raw Cedrics XML: import it directly.
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        # used when keep_metadata = True
                        "temp_folder": self.temp_folder,
                        # to fetch the metadata stored in the json
                        "import_folder": self.import_folder,
                        "from_folder": self.import_folder,
                        "to_folder": self.to_folder,
                        # when article.XML are in separate files
                        "xml_file_folder": xml_file_folder,
                        "keep_metadata": self.keep_metadata,
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                # One progress tick per imported issue
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import every issue XML found in the Cedrics metadata folder
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.import_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()

1796 

1797 

class importCedricsIssueXmlCmd(baseCmd):
    """Import a Cedrics issue from /cedram_dev/exploitation/cedram.

    The Cedrics XML is first converted to JATS with the external
    ``cedram2ptf.py`` script (``import_cedrics_issue``), then the resulting
    issue is stored in the database (``import_in_db``).  With ``diff_only``
    the parsed issue is only compared to the database content
    (``compare_issue``) and nothing is imported.

    NOTE(review): per the docstrings below this conversion workflow is no
    longer used; see importCedricsIssueDirectlyXmlCmd.
    """

    def __init__(self, params=None):
        self.colid = None  # pid of the collection (required)
        self.input_file = None  # Cedrics XML file of the issue
        self.remove_email = True  # passed to cedram2ptf.py ("-e" when False)
        self.remove_date_prod = True  # passed to cedram2ptf.py ("-t" when True)
        self.diff_only = False  # True: compare with the DB, do not import
        self.body = None  # raw XML text of the converted issue
        self.xissue = None  # parsed issue; may be supplied by the caller
        self.copy_files = True  # True: copy binary files during the import

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML

        Also records each article's folder name as its "ojs-id".
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            i = 0
            # NOTE(review): assumes issue.article_set.all() is ordered like
            # tex_folders (one folder per article, same order) — confirm.
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Store the article folder name as the "ojs-id" of the article
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.

        :return: the issue created/updated in the DB
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "use_body": False,
            "xissue": self.xissue,
            "temp_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        # params['body'] = self.body

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        # resolver.copy_binary_files(
        #     issue,
        #     settings.CEDRAM_XML_FOLDER,
        #     settings.MERSENNE_TEST_DATA_FOLDER)

        # Attach ojs-ids and the HTML full text to the articles
        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """Compare the parsed issue (self.xissue) with the database content.

        :return: (result, issues_diff, xissue) — ``result`` is the comparator
                 verdict (stays True when the issue is not in the DB) and
                 ``issues_diff`` describes the differences found.
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything db_to_issue_data needs, in as few queries as possible
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            # Issue not in the DB yet: nothing to compare, result stays True
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1

            # NOTE(review): the result of this expression is discarded —
            # it looks like it was meant to be assigned (elapsed seconds)
            # and printed instead of the raw timedelta below.
            delta.seconds + delta.microseconds / 1e6
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """Remove a previously converted issue file (if any) and make sure
        the output folders exist.

        :return: the full path of the output file to (re)create
        """
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)

        Runs cedram2ptf.py in a subprocess to convert the Cedrics XML to
        JATS, then parses the result into ``self.xissue``.

        :raises RuntimeError: if the conversion script fails or produced no file
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        # Keep a trace of the exact command (and its output) in a second log file
        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # Append the script log to the error before propagating it
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                file_.close()
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        """Parse the Cedrics issue (unless one was supplied), then either
        compare it with the DB (``diff_only``) or import it."""
        super().internal_do()

        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result

2029 

2030 

# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """Import a Cedrics issue directly from /cedram_dev/production_tex/CEDRAM.

    Unlike the parent class, the Cedrics XML is parsed directly (no
    Cedrics -> JATS transformation): an issue XML is rebuilt by
    concatenating the issue header and the article XML files, then parsed
    with cedrics_parser.CedricsIssue.
    """

    def __init__(self, params=None):
        self.is_seminar = False  # seminar collections have different parsing rules
        self.article_folders = None  # article folder names (set in import_cedrics_issue)
        self.force_dois = True  # True: raise if a parsed article has no DOI
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """Return the lines of *filename* as a list, skipping the first
        ``skip_lines + 1`` lines (indices 0..skip_lines — the XML prolog /
        DOCTYPE / root-tag header that the caller rewrites itself).

        The file is read as UTF-8; legacy files that fail to decode are
        re-read as ISO-8859-1.

        :raises UnicodeDecodeError: if the file is in neither encoding
        """

        def _read_lines(encoding):
            # Single pass over the file, keeping only the lines after the header
            with open(filename, encoding=encoding) as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

        try:
            return _read_lines("utf-8")
        except UnicodeDecodeError:
            # Legacy Cedrics files may still be Latin-1 encoded
            return _read_lines("iso-8859-1")

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        Sets ``self.body``, ``self.article_folders``, ``self.dois`` and
        ``self.xissue``.

        :raises ValueError: if ``force_dois`` is set and an article has no DOI
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        # The issue pid is derived from the input file name
        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            # Copy the issue header (read_file skips its own prolog lines)
            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for basename in self.article_folders:
                src_file = os.path.join(tex_src_folder, basename, basename + "-cdrxml.xml")
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        # recover=True: the concatenated file may not be perfectly well-formed
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """Store the parsed issue (``self.xissue``) in Django/SolR and
        attach the ojs-ids / HTML full text to its articles.

        :return: the issue created/updated in the DB
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "use_body": False,
            "xissue": self.xissue,
            "temp_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

2146 

2147 

class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML (``self.tree``) into an issue data object.

    The parsing is delegated to cedrics_parser.CedricsIssue; the result is
    kept in ``self.xissue`` and returned.
    """

    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        """Build and return the parsed issue."""
        super().internal_do()

        issue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = issue

        return issue

2163 

2164 

class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """Add or replace a single Cedrics article inside an existing issue.

    Workflow (numbered steps below): the Cedrics article XML is parsed, the
    existing article (if any) is backed up, the article is (re)created in
    Django/SolR, the HTML full text is attached, and the extra data of the
    previous version is restored.

    Required params: container_pid, article_folder_name.
    """

    def __init__(self, params=None):
        self.container_pid = None  # pid of the existing issue
        self.article_folder_name = None  # article folder name in the Cedrics tree

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        """:raises exceptions.ResourceDoesNotExist: if the issue is not in the DB
        :raises ValueError: if the parsed article has no DOI
        :return: the article created/updated in the DB
        """
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        i = 1
        for folder in article_folders:
            if folder == self.article_folder_name:
                xarticle.seq = i
            i += 1

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/removal of the existing article
        if existing_article:
            # Start with a backup of the whole issue, in case of a bug.
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
                "do_backup_obj_not_in_metadata": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text as HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

        return article

2292 

2293 

class transformBodyInHtmlXmlCmd(addXmlCmd):
    """Transform the JATS body of the parsed tree into an HTML fragment.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        """Apply the PTF HTML stylesheet and return the inner HTML of
        ``body/article/main`` as a string."""
        super().internal_do()

        stylesheet = etree.parse(settings.PTF_HTML_XSL)
        transform = etree.XSLT(stylesheet)

        document_root = transform(self.tree).getroot()
        main_node = document_root.find("body/article/main")

        return xmldata_jats.innerxml(main_node).decode("utf-8")

2317 

2318 

class addBodyInHtmlXmlCmd(addXmlCmd):
    """Read the JATS body of an article and store the corresponding HTML.

    The article can be given either directly (``set_article``) or through
    its ``pid``.

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        """Resolve ``self.article``/``self.pid`` from one another."""
        super().pre_do()

        if self.article is None:
            if self.pid is None:
                raise ValueError("pid et article sont vides")
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        """Parse the JATS tree and update the article body and its images."""
        super().internal_do()

        parsed = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # Should the warnings of the HTML parsing be collected here?
        # self.warnings.extend(parsed.warnings)

        # Replace the html-image related objects with the figures of the body
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(parsed.figures, self.article, "RelatedObject")

        update_cmd = ptf_cmds.updateArticlePtfCmd(
            {
                "body_html": parsed.body_html,
                "body_tex": parsed.body_tex,
                "body_xml": parsed.body_xml,
                "use_page_count": False,
            }
        )
        update_cmd.set_article(self.article)
        update_cmd.do()

        # updateArticlePtfCmd does not derive from addPtfCmd, so it does not
        # copy binary files: copy the article images explicitly here.
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )

2376 

2377 

2378class updateCacheXmlCmd(baseCmd): 

2379 """ 

2380 recreate the citation_html field of the bibitems 

2381 

2382 Params: colid: pid of the collection to process 

2383 """ 

2384 

2385 def __init__(self, params=None): 

2386 self.colid = None 

2387 self.start_id = None 

2388 

2389 super().__init__(params) 

2390 

2391 self.required_params.extend(["colid"]) 

2392 

2393 def update_article(self, xarticle): 

2394 article = model_helpers.get_article(xarticle.pid) 

2395 if article is None: 

2396 raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist") 

2397 

2398 article.title_html = xarticle.title_html 

2399 article.title_tex = xarticle.title_tex 

2400 article.trans_title_html = xarticle.trans_title_html 

2401 article.trans_title_tex = xarticle.trans_title_tex 

2402 article.save() 

2403 

2404 for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()): 

2405 abstract.value_html = xabstract["value_html"] 

2406 abstract.value_tex = xabstract["value_tex"] 

2407 abstract.save() 

2408 

2409 # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()): 

2410 # kwd_group.value_html = xkwd_group['value_html'] 

2411 # kwd_group.value_tex = xkwd_group['value_tex'] 

2412 # kwd_group.save() 

2413 

2414 for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()): 

2415 bib.citation_html = xbib.citation_html 

2416 bib.citation_tex = xbib.citation_tex 

2417 bib.article_title_tex = xbib.article_title_tex 

2418 bib.chapter_title_tex = xbib.chapter_title_tex 

2419 bib.source_tex = xbib.source_tex 

2420 bib.volume = xbib.volume 

2421 bib.save() 

2422 

2423 if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY: 

2424 params = { 

2425 "body_html": xarticle.body_html, 

2426 "body_tex": xarticle.body_tex, 

2427 "body_xml": xarticle.body_xml, 

2428 "use_page_count": False, 

2429 } 

2430 

2431 cmd = ptf_cmds.updateArticlePtfCmd(params) 

2432 cmd.set_article(article) 

2433 cmd.do() 

2434 

2435 def internal_do(self): 

2436 super().internal_do() 

2437 

2438 collection = model_helpers.get_collection(self.colid) 

2439 if collection is None: 

2440 raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist") 

2441 

2442 qs = collection.content.all().order_by("pid") 

2443 start = self.start_id is None 

2444 for container in qs: 

2445 if not start and container.pid == self.start_id: 

2446 start = True 

2447 

2448 if start: 

2449 print(container.pid) 

2450 with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY 

2451 xml_body = ptf_cmds.exportPtfCmd( 

2452 {"pid": container.pid, "with_body": with_body} 

2453 ).do() 

2454 

2455 parser = etree.XMLParser( 

2456 huge_tree=True, 

2457 recover=True, 

2458 remove_blank_text=False, 

2459 remove_comments=True, 

2460 resolve_entities=True, 

2461 ) 

2462 tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser) 

2463 xissue = jats_parser.JatsIssue(tree=tree) 

2464 

2465 for xarticle in xissue: 

2466 self.update_article(xarticle)