Coverage for apps/ptf/cmds/xml_cmds.py: 67%

1212 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-19 19:20 +0000

1import copy 

2import datetime 

3import os.path 

4import subprocess 

5import sys 

6import traceback 

7 

8from lxml import ElementInclude 

9from lxml import etree 

10 

11from django.conf import settings 

12from django.db import transaction 

13from django.db.models import Prefetch 

14from django.utils import timezone 

15 

16from ptf import exceptions 

17from ptf import model_data 

18from ptf import model_data_comparator 

19from ptf import model_data_converter 

20from ptf import model_helpers 

21from ptf import tex 

22from ptf import utils 

23from ptf.cmds import ptf_cmds 

24from ptf.cmds import solr_cmds 

25from ptf.cmds.base_cmds import baseCmd 

26from ptf.cmds.xml import xml_utils 

27from ptf.cmds.xml.cedrics import cedrics_parser 

28 

29# KEEP THIS UNUSED IMPORT THEY ARE USED 

30from ptf.cmds.xml.jats import jats_parser 

31from ptf.cmds.xml.jats import xmldata as xmldata_jats 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.display import resolver 

34from ptf.models import Article 

35from ptf.models import Collection 

36from ptf.models import Container 

37from ptf.models import Person 

38from ptf.models import backup_obj_not_in_metadata 

39from ptf.models import backup_translation 

40from ptf.models import restore_obj_not_in_metadata 

41from ptf.models import restore_translation 

42 

43 

def find_file(name):
    """Locate *name* under the configured manager XSLT directories.

    Recursively walks every directory listed in ``settings.MANAGER_XSLT_DIRS``
    and returns the full path of the first file whose basename equals *name*,
    or ``None`` when no such file exists.
    """
    for directory in settings.MANAGER_XSLT_DIRS:
        for dirpath, _dirnames, filenames in os.walk(directory):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None

51 

52 

def get_transform(name):
    """Compile and return the XSLT transform ``<name>.xsl``.

    The stylesheet is looked up with :func:`find_file` in the manager XSLT
    directories, parsed, and wrapped in an ``etree.XSLT`` callable.
    """
    stylesheet = etree.parse(find_file(f"{name}.xsl"))
    return etree.XSLT(stylesheet)

57 

58 

class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    use_body = True  # False when the XML tree was parsed upstream (no "body" param required)
    body = None  # raw XML text
    tree = None  # lxml root element, built in pre_do()
    solr_commit_at_the_end = True
    xml_filename_in_log = None  # where the XML body was archived (set in pre_do)
    remove_blank_text = False  # see the Cedrics/JATS note in pre_do()
    xml_file_folder = None  # base folder used to resolve XInclude hrefs

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a fresh log file name "<today>-<CmdClass>-<i>.xml" in settings.LOG_DIR.

        The numeric suffix is incremented until an unused name is found.
        Returns "" when settings.LOG_DIR is not configured.
        """
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree, then archive the XML in settings.LOG_DIR.

        Raises:
            ValueError: if parsing produced no tree.
        """
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # or the spaces will be removed whereas the JATS XML will keep them.
            # We still need remove_blank_text=True for JATS XML for all the other nodes.
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )

            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")

            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)

            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Execute the command inside a single DB transaction.

        On failure: roll back the Solr changes, drop the sub-commands (so undo
        is skipped), append the failure and its traceback to LOG_DIR/cmds.log,
        and re-raise the original exception.
        """
        try:
            obj = super().do(parent)
        except Exception as e:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Log the failure next to the archived XML body
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        # format_exception lines already end with "\n"; the extra
                        # newline keeps the historical double-spaced log format.
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            raise e

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Purge Persons that are no longer referenced by any resource
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the imported pid(s) in cmds.log and archive the XML next to the resource."""
        super().post_do(resource)

        # Purge Persons that are no longer referenced by any resource
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list (e.g. collections)
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach location-bearing objects (ExtLink/RelatedObject/DataStream/
        SupplementaryMaterial) described by *xobjs* to *resource*.

        cmd_type selects the ptf_cmds command used; XML datastreams
        (mimetype "application/xml") are skipped.
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice
                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete *publisher* by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the [fyear, lyear] range of collection *pid* with *container*'s year.

        Bug fix: the unset-bound test is now evaluated *before* the comparison.
        The previous code ran "fyear < collection.fyear or not collection.fyear",
        which raises TypeError in Python 3 when collection.fyear is still None,
        because the ordering comparison is evaluated first.
        """
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()

326 

327 

class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # Fallback provider used when the parsed XML does not name one (see add_collection)
    provider = None
    # NOTE(review): xml_format is only read by the commented-out journal-meta path
    # of internal_do — apparently kept for a future "journal on the fly" feature.
    xml_format = None

    def set_provider(self, provider):
        """Set the fallback provider used when the XML carries no provider."""
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or, when *update* is True, update) a Collection from the
        parsed XML object *xcol* and attach its ext-links.

        Returns the Collection, or None when *xcol* is empty.
        Raises exceptions.ResourceExists when the collection already exists
        and *update* is False.
        """
        if not xcol:
            return None

        # Provider from the XML takes precedence over the command-level one
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        # Choose the add or update command class depending on prior existence
        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Create the collections described by the parsed XML tree.

        Only <publication-meta> nodes are accepted; <collection-meta> and
        <journal-meta> raise ValueError.  Returns the list of Collections.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections

429 

430 

class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # When True, assign a DOI to articles that have none (passed to addArticleXmlCmd)
    assign_doi = False
    # Folder holding the articles' full text (PDF or XML), see addArticleXmlCmd
    full_text_folder = ""
    # Folder with extra data (extid false_positive...) restored in post_do
    extra_folder = None
    # NOTE(review): this attribute is never read below; internal_do sets the
    # underscore-prefixed self._prod_deployed_date_iso_8601_date_str instead.
    prod_deployed_date_iso_8601_date_str = None
    # Pre-parsed issue; when None the XML tree is parsed in internal_do
    xissue = None
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child Collection of *journal* keyed on the (e-)issn of *xjournal*.

        Used with meta-collections: the child pid is "<pid>-<issn>" and the
        provider is forced to "mathdoc".  Returns the new Collection.
        """
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the collection the issue really belongs to.

        With meta-collections enabled (settings.USE_META_COLLECTIONS), *journal*
        is the top collection; the issue may belong to a child collection matched
        by issn/e-issn (created on the fly if missing).  Without meta-collections
        the top *journal* is returned unchanged.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is always None in this branch, so the second
            # candidate pid is literally "<pid>-None" — confirm this is intended.
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Import one issue: create the Container, its ext-links/related
        objects/datastreams, then import every article, and finally update the
        top collection's first/last published years.

        Returns the Container, or None when the issue is skipped by embargo.
        Raises ResourceExists if the issue pid is already in the DB, and
        ResourceDoesNotExist if its journal is unknown.
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        for seq, xarticle in enumerate(xissue, start=1):
            params = {
                "xarticle": xarticle,
                "journal": journal,
                "issue": issue,
                "seq": seq,
                "provider": provider,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit_at_the_end": False,
            }
            cmd = addArticleXmlCmd(params)
            cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        # NOTE(review): set only when internal_do runs to completion — post_do
        # would raise AttributeError after an early embargo return; confirm
        # post_do is skipped in that case.
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        """Finalize the issue import: ensure last_modified is set, propagate the
        production deployed date (ptf-tools only) and restore extra data."""
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A possible data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date.
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()

659 

660 

661class addArticleXmlCmd(addXmlCmd): 

662 """ 

663 addArticleXmlCmd: adds/remove an issue 

664 

665 Exception raised: 

666 - exceptions.ResourceExists during do if the article already exists 

667 - exceptions.ResourceDoesNotExist 

668 during undo if the Article does not exist 

669 during do if the serial/issue/provider does not exist 

670 <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

671 """ 

672 

673 xarticle = None 

674 journal = None 

675 issue = None 

676 provider = None 

677 provider_col = None 

678 assign_doi = False 

679 full_text_folder = "" 

680 xml_format = "xmldata_jats" 

681 # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset 

682 restricted_mode = False 

683 # standalone is used to import isolated article, without issues 

684 standalone = False 

685 seq = ( 

686 0 # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ) 

687 ) 

688 keep_translations = False 

689 

690 def set_collection(self, collection): 

691 self.journal = collection 

692 self.provider = collection.provider 

693 

694 def set_xml_format(self, xml_format): 

695 self.xml_format = xml_format 

696 

697 def set_provider(self, provider): 

698 self.provider = provider 

699 

700 def set_provider_col(self, provider_col): 

701 self.provider_col = provider_col 

702 

703 def set_article_single_mode(self): 

704 self.xarticle = jats_parser.JatsArticle(tree=self.tree) 

705 self.warnings.extend(self.xarticle.warnings) 

706 

707 # TODO: MaxiDML: allow the creation of an issue on the fly 

708 # if not self.provider: 

709 # self.provider = model_helpers.get_provider_by_name(self.xarticle.provider) 

710 # 

711 # xmldata_jats.set_pid_type(self.provider.pid_type) 

712 # 

713 # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8") 

714 # cmd = addCollectionsXmlCmd({'body': bdy, 

715 # 'xml_format': self.xml_format, 

716 # 'coltype': "journal"}) 

717 # cmd.set_provider(self.provider_col if self.provider_col else self.provider) 

718 # self.journal = cmd.do()[0] 

719 # 

720 # self.issue = model_helpers.get_container(self.xarticle.issue_id) 

721 # if self.issue is None: 

722 # # need to create the issue 

723 # date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str, 

724 # '%Y-%m-%d') 

725 # pid = "{name}_{year}".format(name=self.journal.pid, year=date.year) 

726 # self.issue = model_helpers.get_container(pid) 

727 # if self.issue is None: 

728 # params = {'ctype': 'issue', 'year': date.year, 'pid': pid, 

729 # 'last_modified_iso_8601_date_str': datetime.datetime.now().strftime( 

730 # "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume, 

731 # # if copy binary, need from_folder / to_folder 

732 # } 

733 # 

734 # cmd = ptf_cmds.addContainerPtfCmd(params) 

735 # cmd.add_collection(self.journal) 

736 # cmd.set_provider(self.provider) 

737 # self.issue = cmd.do() 

738 

739 def get_oai_identifier(self): 

740 return self.xarticle.oai_identifier 

741 

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (the article's full text, used by SolR) when
        the metadata XML did not provide it, reading either the article PDF or
        a separate full-text XML file.
        """
        # CEDRICS import: the full text comes from a separate file
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the plain text from the article's first PDF datastream
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # Full text lives in "<full_text_folder><pid>.xml"; extract its <body>
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # Articles identified by a "pii" (presumably the Numdam acquisition
            # pipeline — note the hard-coded path): read the optional full-text XML
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

784 

785 def internal_do(self): 

786 super().internal_do() 

787 

788 if self.xarticle is None and self.journal is not None: 788 ↛ 790line 788 didn't jump to line 790, because the condition on line 788 was never true

789 # self.restricted_mode = True 

790 self.set_article_single_mode() 

791 self.update = True 

792 else: 

793 self.update = False 

794 

795 if self.xarticle.pid is None: 

796 self.xarticle.pid = ( 

797 self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

798 ) 

799 

800 for xtranslated_article in self.xarticle.translations: 800 ↛ 801line 800 didn't jump to line 801, because the loop on line 800 never started

801 for xtream in xtranslated_article.streams: 

802 if xtream["mimetype"] == "text/html": 

803 if self.from_folder is None: 

804 raise ValueError( 

805 "The article has its full text in a separate HTML file. You need to set from_folder" 

806 ) 

807 

808 location = os.path.join(self.from_folder, xtream["location"]) 

809 body_html = resolver.get_body(location) 

810 body = xml_utils.get_text_from_xml_with_mathml(body_html) 

811 xtranslated_article.body_html = body_html 

812 xtranslated_article.body = body 

813 

814 for stream in self.xarticle.streams: 

815 if stream["mimetype"] == "text/html": 

816 location = os.path.join(self.from_folder, stream["location"]) 

817 body_html = resolver.get_body(location) 

818 body = xml_utils.get_text_from_xml_with_mathml(body_html) 

819 self.xarticle.body_html = body_html 

820 self.xarticle.body = body 

821 

822 if self.xarticle.doi: 

823 article = model_helpers.get_article_by_doi(self.xarticle.doi) 

824 else: 

825 article = model_helpers.get_article(self.xarticle.pid) 

826 needs_to_restore_article = False 

827 

828 if article is not None: 828 ↛ 829line 828 didn't jump to line 829, because the condition on line 828 was never true

829 if self.update or self.standalone: 

830 if self.standalone: 

831 self.provider = article.provider 

832 

833 needs_to_restore_article = True 

834 backup_obj_not_in_metadata(article) 

835 

836 if self.keep_translations: 

837 backup_translation(article) 

838 

839 cmd = ptf_cmds.addArticlePtfCmd( 

840 { 

841 "pid": article.pid, 

842 "to_folder": self.to_folder, # on supprime les fichiers pour être sûr 

843 } 

844 ) 

845 cmd.set_object_to_be_deleted(article) 

846 cmd.undo() 

847 else: 

848 raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists") 

849 

850 # Override seq 

851 if self.standalone and article is not None: 851 ↛ 852line 851 didn't jump to line 852, because the condition on line 851 was never true

852 self.xarticle.seq = article.seq 

853 elif ( 

854 not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0 

855 ) or (hasattr(self, "pii") and self.seq != 0): 

856 self.xarticle.seq = self.seq 

857 

858 # Get the article's text (body) for SolR if it is empty from the PDF 

859 self.update_xobj_with_body() 

860 

861 params = { 

862 "xobj": self.xarticle, 

863 "pid": self.xarticle.pid, 

864 "from_folder": self.from_folder, 

865 "to_folder": self.to_folder, 

866 "assign_doi": self.assign_doi and not self.xarticle.doi, 

867 "solr_commit": False, 

868 } 

869 

870 cmd = ptf_cmds.addArticlePtfCmd(params) 

871 if self.issue or not self.standalone: 871 ↛ 873line 871 didn't jump to line 873, because the condition on line 871 was never false

872 cmd.set_container(self.issue) 

873 cmd.add_collection(self.journal) 

874 article = cmd.do(self) 

875 

876 self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink") 

877 self.add_objects_with_location(self.xarticle.streams, article, "DataStream") 

878 if not self.restricted_mode: 878 ↛ 883line 878 didn't jump to line 883, because the condition on line 878 was never false

879 self.add_objects_with_location( 

880 self.xarticle.supplementary_materials, article, "SupplementaryMaterial" 

881 ) 

882 

883 if ( 

884 hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY 

885 ) or settings.SITE_NAME == "ptf_tools": 

886 self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject") 

887 

888 for xtrans_article, trans_article in zip( 888 ↛ 891line 888 didn't jump to line 891, because the loop on line 888 never started

889 self.xarticle.translations, cmd.cmd.translated_articles 

890 ): 

891 self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream") 

892 

893 if needs_to_restore_article: 893 ↛ 894line 893 didn't jump to line 894, because the condition on line 893 was never true

894 restore_obj_not_in_metadata(article) 

895 

896 if self.keep_translations: 

897 restore_translation(article) 

898 

899 return article 

900 

901 

class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/removes translations.
    The original article is not changed.
    The current translations for self.lang are first removed, then the
    translations declared in the XML are added back, with optional HTML/PDF
    datastreams, and the translated PDF is (re)generated at the end.

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the article does not exist
    """

    lang = ""  # language code of the translation to add/replace
    html_file_name = ""  # HTML full text already copied on disk by upload/views
    pdf_file_name = ""  # PDF file location (generated at the end if needed)
    date_published_str = ""  # ISO-8601 date; if set, the PDF is assumed to already exist

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Bug fix: use the locally parsed xarticle; self.xarticle is not
            # set by this command and dereferencing it here would raise an
            # AttributeError instead of the intended exception.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge existing article with new translation:
        # keep every existing translation except the one being replaced
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article

996 

997 

class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd:
    Adds a PCJ article from its JATS XML, optionally attaching a full-text
    HTML datastream, then delegates the import to addArticleXmlCmd.
    """

    html_file_name = ""

    def internal_do(self):
        super().internal_do()

        parsed_article = jats_parser.JatsArticle(tree=self.tree)

        # Attach the HTML full text as a datastream when a file was provided
        if self.html_file_name:
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            parsed_article.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": parsed_article,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()

1030 

1031 

class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Book does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # may be injected with set_provider(); otherwise derived from the XML
    import_oai_mode = False  # parse with the OAI xmldata classes instead of BITS
    journal = None  # collection used in OAI mode ("GDML_Books")
    xml_format = "xmldata_jats"  # name of the xmldata module used in OAI mode
    xbook = None  # pre-parsed book data (skips XML parsing when set)
    _collection = None

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        """Add the given book parts; seq is 1-based within the parent part (pseq)."""
        if xparts:
            for seq, xpart in enumerate(xparts, start=1):
                self.add_part(xpart, seq, pseq)

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article) and recurse into its sub-parts."""
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        # Recurse: nested parts use this part's seq as their pseq
        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Resolve the provider (unless injected with set_provider)

        if not self.provider:
            self.provider = model_helpers.get_provider_by_name(xbook.provider)

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Bug fix: use self.provider — the local `provider` variable only
        # existed when the provider had not been injected via set_provider(),
        # which made this line a NameError in that case.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book

1188 

1189 

1190###################################################################################### 

1191###################################################################################### 

1192# 

1193# Update Commands 

1194# 

1195###################################################################################### 

1196###################################################################################### 

1197 

1198 

class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    @staticmethod
    def _parse_collection_node(node):
        """Parse one child node into collection metadata, or None for unrecognized tags."""
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """
        Update one collection from its parsed metadata.

        With do_update=False, only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist if the collection is unknown.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)

            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (raises before any update)
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is not None:
                self.update_collection(xcol, False)

        # Second pass: perform the updates.
        # Robustness fix: unrecognized tags are skipped instead of raising an
        # AttributeError on xcol.warnings when xcol is None.
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is None:
                continue
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1277 

1278 

1279##################################################################### 

1280# 

1281# replaceIssueXmlCmd: updates an issue 

1282# 

1283# Exception raised: 

1284# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1285# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1286# - RuntimeError if undo is called 

1287# 

1288###################################################################### 

class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: replaces an existing issue with the content of the XML.

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue before re-importing it
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Robustness fix: my_publisher can be None; guard before dereferencing
        # (consistent with the addBookXmlCmd publisher handling)
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1343 

1344 

class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Deletes the existing book then re-imports it from the XML.

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree)
        self.warnings.extend(parsed_book.warnings)

        existing = model_helpers.get_container(parsed_book.pid)
        if existing is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current book before re-importing it
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing)
        delete_cmd.undo()

        return addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        ).do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1392 

1393 

class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        # Build the delegate command matching the root element of the XML
        if root_tag == "journal-issue":
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif root_tag == "book":
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        result = delegate.do()
        self.warnings.extend(delegate.warnings)
        return result

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1453 

1454 

class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    Adds an issue if it is not in the system or updates the issue if it is already there.
    By default, no DOI is assigned for the articles. Set assign_doi to True.

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy
    backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json

    keep_metadata:
        True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder
        Default: False
        Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called
        We always want to preserve GraphicalAbstracts (they are not in the issue XML)

    keep_translations:
        True if you want back up/restore translations.
        Default: False
        Note: When you post an article to a journal (test) website, the translation is declared in the XML
        But if you import a Cedrics article in Trammel, the XML does not list translations

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    assign_doi = False
    full_text_folder = ""

    xissue = None  # pre-parsed issue data; skips XML parsing when set
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        """Validate the mutually-exclusive / dependent options before running."""
        super().check_params()

        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.backup_folder is None:
            raise ValueError("backup_folder needs to be set when keep_metadata is true.")

    def internal_do(self):
        """Add or update the issue described by the XML tree (or by self.xissue)."""
        super().internal_do()

        # Parse the issue XML unless a pre-parsed issue was supplied
        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree, from_folder=self.from_folder, no_bib=self.no_bib
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print each distinct warning only once (deduplicated by value)
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        if self.fake:
            # Parse-only mode: nothing is imported
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.embargo and existing_issue.embargo():
                # Geodesic is for open access articles.
                # We do not want to import the issues under embargo
                print(f"Embargo, ignore {xissue.pid}")
                return None

            if self.keep_metadata:
                # Start by making a backup of the existing issue, in case of a bug.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"),
                    }
                ).do()

                # Save the extra data (extid, deployed_date,...) in a json
                # that will be re-imported along with the import of the new issue
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.backup_folder,
                    "export_all": True,
                    "with_binary_files": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Always back up the objects that are not part of the XML metadata
            # (e.g. GraphicalAbstracts); see the class docstring.
            for article in existing_issue.article_set.all():
                backup_obj_not_in_metadata(article)
                if self.keep_translations:
                    backup_translation(article)

            # Delete the existing issue, otherwise the import will complain
            # about already existing articles
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, the articles of the "to appear" volume are moved into a
            # new volume before publication (from AIF_0__0_ to AIF_2018... for ex).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3, AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not a good solution because doi3 would be lost.
            # The articles common to _0__0 and 2018_ have to be deleted before
            # importing the new volume, otherwise there would be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the extra data (extid, deployed_date,...) in a json that
                # will be re-imported with the import of the new issue, as well
                # as the associated image via ptf-tools
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.backup_folder,
                        "export_all": True,
                        "with_binary_files": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                # Delete the articles present in both volumes
                for xarticle in xissue:
                    xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        backup_obj_not_in_metadata(article)
                        if self.keep_translations:
                            backup_translation(article)

                        params = {"to_folder": self.to_folder}  # to delete the binary files
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        # If backup_folder is not None, then addIssueXmlCmd.post_do() uses importExtraDataPtfCmd
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "extra_folder": self.backup_folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "embargo": self.embargo,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if new_issue:
            new_articles = new_issue.article_set.all()

            # With the self.assign_doi option, check that the DOIs have indeed been assigned
            for article in new_articles:
                if self.assign_doi and article.doi is None:
                    raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

                # TODO garbage collector on articles no longer in the issue
                restore_obj_not_in_metadata(article)
                if self.keep_translations:
                    restore_translation(article)

        return new_issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1654 

1655 

class addOrUpdateBookXmlCmd(addXmlCmd):
    """Add a book, first removing any existing book with the same pid."""

    xbook = None

    def internal_do(self):
        super().internal_do()

        if self.xbook:
            xbook = self.xbook
        else:
            xbook = jats_parser.BitsBook(tree=self.tree)
            self.warnings.extend(xbook.warnings)

        existing = model_helpers.get_container(xbook.pid)

        if existing:
            # Remove the previous version of the book
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing)
            delete_cmd.undo()

            collection = existing.get_collection()

            # Recompute the collection's first/last year without the deleted book
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        add_cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit": False,
            }
        )
        return add_cmd.do()

1696 

1697 

class updateBibitemCitationXmlCmd(baseCmd):
    """Refresh a bibitem's citation XML/HTML/TeX from its current BibItemIds."""

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the current ids of the bibitem, keyed by id_type
        new_ids = {
            bid.id_type: {
                "id_type": bid.id_type,
                "id_value": bid.id_value,
                "checked": bid.checked,
                "false_positive": bid.false_positive,
            }
            for bid in self.bibitem.bibitemid_set.all()
        }

        # Rebuild the citation representations and persist them
        xbibitem = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(xbibitem.warnings)

        self.bibitem.citation_xml = xbibitem.citation_xml
        self.bibitem.citation_html = xbibitem.citation_html
        self.bibitem.citation_tex = xbibitem.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1733 

1734 

1735###################################################################################### 

1736###################################################################################### 

1737# 

1738# Import Commands 

1739# 

1740###################################################################################### 

1741###################################################################################### 

1742 

1743 

class collectEntireCollectionXmlCmd(baseCmd):
    """
    Collect the PIDs of all the XML of a collection (collection.xml, issues.xml)
    found in a given folder.

    results: the list of pids
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()

        # Keep only the pid of each (pid, file) pair found in the folder
        return [
            issue_pid
            for issue_pid, _issue_file in resolver.iterate_collection_folder(
                self.folder, self.pid
            )
        ]

1763 

1764 

class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        # Collection pid and source/destination folders
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False
        self.keep_translations = False

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Optional progress reporting: self.callback(self.job, i) is called
        # after each imported issue; when callback is None, pids are printed.
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        """
        Import the collection:
        1. Create the Collection from collection.xml if it is not in the database yet
        2. Import every issue found in from_folder (JATS, or Cedrics if from_cedrics)
        3. If with_cedrics, also (re)import the Cedrics XML files of the
           cedram metadata folder.
        """
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: import collection.xml first
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        i = 0
        # NOTE(review): the loop variable shadows the collection pid read above
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                # The issue is a Cedrics XML file: parse it directly
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                # The issue is a JATS XML file
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import the Cedrics issues of the cedram metadata folder
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()

1885 

1886 

class importCedricsIssueXmlCmd(baseCmd):
    """
    Import a Cedrics issue: convert the Cedrics XML into JATS (external
    cedram2ptf.py script), parse the result, then add/update the issue in the
    database — or, with diff_only, only compare it with the existing issue.
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True
        self.remove_date_prod = True
        self.diff_only = False  # Compare with the existing issue instead of importing
        self.body = None
        self.xissue = None  # May be supplied directly through the params
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): assumes issue.article_set and tex_folders are in the
            # same order and have the same length — confirm with callers
            i = 0
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Store the article folder name as the "ojs-id" of the article
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # same reason
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        # params['body'] = self.body

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        # resolver.copy_binary_files(
        #     issue,
        #     settings.CEDRAM_XML_FOLDER,
        #     settings.MERSENNE_TEST_DATA_FOLDER)

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """
        Compare the parsed issue (self.xissue) with the issue already in the
        database. Returns (result, issues_diff, xissue); result is True when
        no existing issue was found or no difference was detected.
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything needed to build the issue data in one query pass
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            # No existing issue: nothing to compare, result stays True
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1

            # NOTE(review): the result of this expression is discarded —
            # looks like leftover timing code; only the timedelta is printed
            delta.seconds + delta.microseconds / 1e6
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """
        Remove the output file of a previous run (if any) and make sure the
        output folders exist. Returns the full path of the output file.
        """
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                # NOTE(review): cmd_str is run through the shell; its parts come
                # from settings/files, not user input — confirm this stays true
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # Append the conversion log to the error message before giving up
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                file_.close()
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        # 3. Parse the generated JATS file
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        """
        Convert/parse the issue (unless an xissue was passed in the params),
        then import it in the database, or only compare it (diff_only).
        """
        super().internal_do()

        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result

2119 

2120 

2121# import from /cedram_dev/production_tex/CEDRAM 

class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """
    Import a Cedrics issue by parsing the Cedrics XML directly (no
    Cedrics -> JATS transformation): the article XML files found in
    /cedram_dev/production_tex/CEDRAM are concatenated into one issue XML,
    which is then parsed by cedrics_parser.
    """

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None
        self.dois = None  # set together with article_folders in import_cedrics_issue
        self.force_dois = True  # Raise if an article has no DOI
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """
        Return the lines of filename, skipping the first skip_lines + 1 lines
        (the XML declaration/doctype of the individual Cedrics files).
        Tries utf-8 first, then falls back to iso-8859-1 on decode errors.
        """

        def read_lines(encoding):
            # Single pass, keeping only the lines after index skip_lines
            with open(filename, encoding=encoding) as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

        try:
            return read_lines("utf-8")
        except UnicodeDecodeError:
            return read_lines("iso-8859-1")

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        # The issue pid is derived from the input file name
        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for article_folder in self.article_folders:
                src_file = os.path.join(
                    tex_src_folder, article_folder, article_folder + "-cdrxml.xml"
                )
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            # Every article must have a DOI
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """
        Add/update the parsed issue (self.xissue) in Django/SolR.
        Returns the issue created by addOrUpdateIssueXmlCmd.
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

2237 

2238 

class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML tree and return the resulting CedricsIssue."""

    # Default parameters, overridable through the params dict
    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        super().internal_do()

        parsed_issue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = parsed_issue

        return parsed_issue

2254 

2255 

class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """
    Add or update a single Cedrics article inside an existing issue:
    parse the article's Cedrics XML, replace the article in Django/SolR
    and restore the extra data (extids, dates, translations) of a
    previous import.
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        # (dois is unused here; only the folder list matters)
        article_folders, dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        i = 1
        for folder in article_folders:
            if folder == self.article_folder_name:
                xarticle.seq = i
            i += 1

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/removal of the existing article
        if existing_article:
            # Start with a backup of the existing issue, in case of a bug.
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

            # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text as HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # same reason
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article

2389 

2390 

class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    Transform the JATS body of an article into HTML and return it as text.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the JATS -> HTML stylesheet
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))

        # Apply it to the current tree and locate the <main> element
        html_root = transform(self.tree).getroot()
        main_node = html_root.find("body/article/main")

        # Return the inner XML of <main> as a unicode string
        return xmldata_jats.innerxml(main_node).decode("utf-8")

2414 

2415 

class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    Read the JATS body of an article and store the corresponding HTML
    on the article, then copy its images.

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        super().pre_do()

        # The command needs an article, given either directly or through its pid
        if self.article is None:
            if self.pid is None:
                raise ValueError("pid et article sont vides")
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        parsed = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # Should we collect the warnings of the HTML parsing?
        # self.warnings.extend(parsed.warnings)

        # Replace the html-image related objects with the figures of the body
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(parsed.figures, self.article, "RelatedObject")

        update_cmd = ptf_cmds.updateArticlePtfCmd(
            {
                "body_html": parsed.body_html,
                "body_tex": parsed.body_tex,
                "body_xml": parsed.body_xml,
                "use_page_count": False,
            }
        )
        update_cmd.set_article(self.article)
        update_cmd.do()

        # updateArticlePtfCmd is not an addPtfCmd, so it does not copy binary
        # files: copy the article images here
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )

2473 

2474 

2475class updateCacheXmlCmd(baseCmd): 

2476 """ 

2477 recreate the citation_html field of the bibitems 

2478 

2479 Params: colid: pid of the collection to process 

2480 """ 

2481 

2482 def __init__(self, params=None): 

2483 self.colid = None 

2484 self.start_id = None 

2485 

2486 super().__init__(params) 

2487 

2488 self.required_params.extend(["colid"]) 

2489 

    def update_article(self, xarticle):
        """
        Refresh the cached HTML/TeX fields (titles, abstracts, bibitems and
        optionally the body) of the article matching xarticle.pid.
        Raises ResourceDoesNotExist if the article is not in the database.
        """
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): the zips below assume the parsed XML and the database
        # return the abstracts/bibitems in the same order and quantity — verify
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        # Refresh the article body only if the site displays it
        if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY:
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

2531 

    def internal_do(self):
        """
        Re-export each container of the collection to XML, parse it back and
        refresh the cached fields of its articles (see update_article).
        Containers are processed in pid order, starting at start_id if given.
        """
        super().internal_do()

        collection = model_helpers.get_collection(self.colid)
        if collection is None:
            raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist")

        qs = collection.content.all().order_by("pid")
        # Skip the containers before start_id (when provided)
        start = self.start_id is None
        for container in qs:
            if not start and container.pid == self.start_id:
                start = True

            if start:
                print(container.pid)
                with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
                xml_body = ptf_cmds.exportPtfCmd(
                    {"pid": container.pid, "with_body": with_body}
                ).do()

                parser = etree.XMLParser(
                    huge_tree=True,
                    recover=True,
                    remove_blank_text=False,
                    remove_comments=True,
                    resolve_entities=True,
                )
                tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser)
                xissue = jats_parser.JatsIssue(tree=tree)

                for xarticle in xissue:
                    self.update_article(xarticle)