Coverage for apps/ptf/cmds/xml_cmds.py: 67%

1215 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-06-19 08:36 +0000

1import copy 

2import datetime 

3import os.path 

4import subprocess 

5import sys 

6import traceback 

7 

8from lxml import ElementInclude 

9from lxml import etree 

10 

11from django.conf import settings 

12from django.db import transaction 

13from django.db.models import Prefetch 

14from django.utils import timezone 

15 

16from ptf import exceptions 

17from ptf import model_data 

18from ptf import model_data_comparator 

19from ptf import model_data_converter 

20from ptf import model_helpers 

21from ptf import tex 

22from ptf import utils 

23from ptf.cmds import ptf_cmds 

24from ptf.cmds import solr_cmds 

25from ptf.cmds.base_cmds import baseCmd 

26from ptf.cmds.xml import xml_utils 

27from ptf.cmds.xml.cedrics import cedrics_parser 

28 

# KEEP THESE UNUSED IMPORTS: THEY ARE USED (referenced indirectly, e.g. via globals())

30from ptf.cmds.xml.jats import jats_parser 

31from ptf.cmds.xml.jats import xmldata as xmldata_jats 

32from ptf.cmds.xml.xml_utils import normalize 

33from ptf.display import resolver 

34from ptf.models import Article 

35from ptf.models import Collection 

36from ptf.models import Container 

37from ptf.models import Person 

38from ptf.models import backup_obj_not_in_metadata 

39from ptf.models import backup_translation 

40from ptf.models import restore_obj_not_in_metadata 

41from ptf.models import restore_translation 

42 

43 

def find_file(name):
    """Return the full path of *name* under one of settings.MANAGER_XSLT_DIRS.

    Each configured directory is walked recursively; the first match wins.
    Returns None when the file is not found in any directory.
    """
    for base_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _dirnames, filenames in os.walk(base_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None

51 

52 

def get_transform(name):
    """Build an lxml XSLT transform from the stylesheet named '<name>.xsl'.

    The stylesheet is located with find_file() in settings.MANAGER_XSLT_DIRS.
    """
    stylesheet_path = find_file(f"{name}.xsl")
    stylesheet_doc = etree.parse(stylesheet_path)
    return etree.XSLT(stylesheet_doc)

57 

58 

class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    # Class-level defaults; instances override them through the params dict.
    use_body = True  # False when a parsed tree is supplied directly (self.tree)
    body = None  # raw XML text
    tree = None  # parsed lxml tree (built in pre_do when use_body is True)
    solr_commit_at_the_end = True
    xml_filename_in_log = None  # where the XML body was archived (see pre_do)
    remove_blank_text = False  # see the Cedrics/JATS note in pre_do
    xml_file_folder = None  # base folder used to resolve XML includes

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return a fresh (unused) log file name '<date>-<class>-<i>.xml' in LOG_DIR.

        Returns "" when settings.LOG_DIR is not configured.
        """
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Bump the numeric suffix until the name is unused
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree, then archive the XML body in LOG_DIR.

        Raises ValueError if no tree could be obtained.
        """
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )
            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href: strip the xlink namespace
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            self.tree = tree

        if self.tree is None:
            raise ValueError("tree est vide")

        # Write the xml body on disk
        if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
            self.xml_filename_in_log = self.get_logname()

            with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Execute the command inside a database transaction.

        On failure: rolls back Solr, clears sub-commands (so undo is a no-op),
        appends the traceback to LOG_DIR/cmds.log and re-raises.
        On success: commits Solr unless solr_commit_at_the_end is False.
        """
        try:
            obj = super().do(parent)
        except Exception as e:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Log the failure (with the name of the archived XML body) on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            raise e

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Purge Person objects that are no longer referenced
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the created resource pid(s) in cmds.log and archive the XML per collection."""
        super().post_do(resource)

        # Purge Person objects that are no longer referenced
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list; build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                # Keep a copy of the XML under LOG_DIR/<top collection pid>/<pid>/
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach located objects to a resource.

        xobjs: list of dicts with keys 'base', 'rel', 'location' and optionally
        'mimetype', 'metadata', 'text', 'caption'.
        cmd_type: one of "ExtLink", "RelatedObject", "SupplementaryMaterial",
        "DataStream" — selects the ptf_cmds command used to create each object.
        May raise ResourceExists if the same ExtLink/RelatedObject is added twice.
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                # Reuse the XmlBase if it exists, create it otherwise
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice
                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete a publisher by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the collection's [fyear, lyear] range to include the container's year."""
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # BUGFIX: check for an empty fyear/lyear BEFORE comparing.
            # The previous order (fyear < collection.fyear or not collection.fyear)
            # raised a TypeError on Python 3 when collection.fyear was None.
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()

326 

327 

class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None  # fallback provider when the XML does not name one
    xml_format = None

    def set_provider(self, provider):
        """Set the fallback provider used when the parsed collection has none."""
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or update, if update=True) a Collection from parsed data *xcol*.

        Returns the Collection, or None if xcol is empty.
        Raises exceptions.ResourceExists when the collection already exists
        and update is False.
        """
        if not xcol:
            return None

        # Prefer the provider named in the XML over the command-level one
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta> node.

        Returns the list of created Collections.
        Raises ValueError for <journal-meta>/<collection-meta> roots, which are
        not supported entry points for collection creation.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            #          to be compatible with jats_parser.py
            # TODO 2 : Prevent the creation of the collection on the fly ?
            #          Shouldn't the collection be monitored/controlled ?
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections

429 

430 

class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None  # pre-parsed issue data; when None, self.tree is parsed in internal_do
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child Collection of *journal* for xjournal, pid'ed '<pid>-<issn>'.

        Used with meta-collections: the child holds one historic incarnation
        of the journal (identified by its issn/e-issn).
        """
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the collection the issue belongs to, creating a child if needed.

        When meta-collections are enabled (settings.USE_META_COLLECTIONS),
        *journal* is the top collection and the issue may actually belong to
        one of its children, looked up by issn/e-issn (or by pid when the
        journal has no issn). Without meta-collections, returns *journal*.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is always None in this branch, so the second
            # candidate pid is "<pid>-None" — confirm this is intended.
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the Container for the issue and an Article per item of the issue.

        Returns the created issue, or None when the issue is skipped because of
        the embargo. Raises ResourceExists / ResourceDoesNotExist (see class doc).
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        for seq, xarticle in enumerate(xissue, start=1):
            params = {
                "xarticle": xarticle,
                "journal": journal,
                "issue": issue,
                "seq": seq,
                "provider": provider,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit_at_the_end": False,
            }
            cmd = addArticleXmlCmd(params)
            cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A possible data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()

659 

660 

class addArticleXmlCmd(addXmlCmd):
    """
    addArticleXmlCmd: adds/removes an article

    Exception raised:
        - exceptions.ResourceExists during do if the article already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Article does not exist
            during do if the serial/issue/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    """

    xarticle = None  # parsed article data; when None, self.tree is parsed
    journal = None  # Collection the article belongs to
    issue = None  # Container the article belongs to (None in standalone mode)
    provider = None
    provider_col = None
    assign_doi = False
    full_text_folder = ""  # folder holding the full text when not in the XML
    xml_format = "xmldata_jats"
    # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset
    restricted_mode = False
    # standalone is used to import isolated article, without issues
    standalone = False
    seq = (
        0  # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ)
    )
    keep_translations = False  # backup/restore article translations across re-imports

689 

690 def set_collection(self, collection): 

691 self.journal = collection 

692 self.provider = collection.provider 

693 

    def set_xml_format(self, xml_format):
        """Set the name of the xmldata module used for parsing (default: "xmldata_jats")."""
        self.xml_format = xml_format

696 

    def set_provider(self, provider):
        """Set the provider used when creating the article."""
        self.provider = provider

699 

    def set_provider_col(self, provider_col):
        """Set the provider used for the collection (falls back to self.provider)."""
        self.provider_col = provider_col

702 

    def set_article_single_mode(self):
        """Parse self.tree as a single JATS article (no enclosing issue XML)."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # (previously sketched here: derive the provider from the article,
        # create the journal with addCollectionsXmlCmd, then create a
        # per-year issue container if none exists)

738 

    def get_oai_identifier(self):
        """Return the OAI identifier of the parsed article."""
        return self.xarticle.oai_identifier

741 

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (full text) when it is empty.

        The text comes either from the article's PDF (Cedrics/cedram import)
        or from a separate XML file whose <body> node holds the full text.
        The caller uses this text for SolR indexing.
        """
        # Cedrics import: the full text comes from a separate file
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's first PDF stream
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # Read the <body> node of <full_text_folder><pid>.xml
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # Article with a "pii": read the <body> from the Numdam acquisition tree
            # NOTE(review): hard-coded path — presumably Numdam-specific; confirm.
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

784 

785 def internal_do(self): 

786 super().internal_do() 

787 

788 if self.xarticle is None and self.journal is not None: 788 ↛ 790line 788 didn't jump to line 790, because the condition on line 788 was never true

789 # self.restricted_mode = True 

790 self.set_article_single_mode() 

791 self.update = True 

792 else: 

793 self.update = False 

794 

795 if self.xarticle.pid is None: 

796 self.xarticle.pid = ( 

797 self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

798 ) 

799 

800 for xtranslated_article in self.xarticle.translations: 800 ↛ 801line 800 didn't jump to line 801, because the loop on line 800 never started

801 for xtream in xtranslated_article.streams: 

802 if xtream["mimetype"] == "text/html": 

803 if self.from_folder is None: 

804 raise ValueError( 

805 "The article has its full text in a separate HTML file. You need to set from_folder" 

806 ) 

807 

808 location = os.path.join(self.from_folder, xtream["location"]) 

809 body_html = resolver.get_body(location) 

810 body = xml_utils.get_text_from_xml_with_mathml(body_html) 

811 xtranslated_article.body_html = body_html 

812 xtranslated_article.body = body 

813 

814 for stream in self.xarticle.streams: 

815 if stream["mimetype"] == "text/html": 

816 location = os.path.join(self.from_folder, stream["location"]) 

817 body_html = resolver.get_body(location) 

818 body = xml_utils.get_text_from_xml_with_mathml(body_html) 

819 self.xarticle.body_html = body_html 

820 self.xarticle.body = body 

821 

822 if self.xarticle.doi: 

823 article = model_helpers.get_article_by_doi(self.xarticle.doi) 

824 else: 

825 article = model_helpers.get_article(self.xarticle.pid) 

826 needs_to_restore_article = False 

827 

828 if article is not None: 828 ↛ 829line 828 didn't jump to line 829, because the condition on line 828 was never true

829 if self.update or self.standalone: 

830 if self.standalone: 

831 self.provider = article.provider 

832 

833 needs_to_restore_article = True 

834 backup_obj_not_in_metadata(article) 

835 

836 if self.keep_translations: 

837 backup_translation(article) 

838 

839 cmd = ptf_cmds.addArticlePtfCmd( 

840 { 

841 "pid": article.pid, 

842 "to_folder": self.to_folder, # on supprime les fichiers pour être sûr 

843 } 

844 ) 

845 cmd.set_object_to_be_deleted(article) 

846 cmd.undo() 

847 else: 

848 raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists") 

849 

850 # Override seq 

851 if self.standalone and article is not None: 851 ↛ 852line 851 didn't jump to line 852, because the condition on line 851 was never true

852 self.xarticle.seq = article.seq 

853 elif ( 

854 not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0 

855 ) or (hasattr(self, "pii") and self.seq != 0): 

856 self.xarticle.seq = self.seq 

857 

858 # Get the article's text (body) for SolR if it is empty from the PDF 

859 self.update_xobj_with_body() 

860 

861 params = { 

862 "xobj": self.xarticle, 

863 "pid": self.xarticle.pid, 

864 "from_folder": self.from_folder, 

865 "to_folder": self.to_folder, 

866 "assign_doi": self.assign_doi and not self.xarticle.doi, 

867 "solr_commit": False, 

868 } 

869 

870 cmd = ptf_cmds.addArticlePtfCmd(params) 

871 if self.issue or not self.standalone: 871 ↛ 873line 871 didn't jump to line 873, because the condition on line 871 was never false

872 cmd.set_container(self.issue) 

873 cmd.add_collection(self.journal) 

874 article = cmd.do(self) 

875 

876 self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink") 

877 self.add_objects_with_location(self.xarticle.streams, article, "DataStream") 

878 if not self.restricted_mode: 878 ↛ 883line 878 didn't jump to line 883, because the condition on line 878 was never false

879 self.add_objects_with_location( 

880 self.xarticle.supplementary_materials, article, "SupplementaryMaterial" 

881 ) 

882 

883 if ( 

884 hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY 

885 ) or settings.SITE_NAME == "ptf_tools": 

886 self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject") 

887 

888 for xtrans_article, trans_article in zip( 888 ↛ 891line 888 didn't jump to line 891, because the loop on line 888 never started

889 self.xarticle.translations, cmd.cmd.translated_articles 

890 ): 

891 self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream") 

892 

893 if needs_to_restore_article: 893 ↛ 894line 893 didn't jump to line 894, because the condition on line 893 was never true

894 restore_obj_not_in_metadata(article) 

895 

896 if self.keep_translations: 

897 restore_translation(article) 

898 

899 return article 

900 

901 

class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/remove translations.
    The original article is not changed
    The current translations are first removed

    Params (class attributes, overridable through the params dict):
        lang: language code of the translation to add/replace
        html_file_name: relative path of the translated HTML full text (optional)
        pdf_file_name: relative path of the translated PDF (optional)
        date_published_str: ISO-8601 date string; when non-empty, the PDF is
            assumed to already exist and its compilation is skipped

    Raises:
        exceptions.ResourceDoesNotExist: if the article is not in the database.
    """

    lang = ""
    html_file_name = ""
    pdf_file_name = ""
    date_published_str = ""

    def internal_do(self):
        """Parse the article XML, merge the translation for self.lang into the
        existing article, re-import the article, then (re)generate the
        translated PDF with pdf-translate."""
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # NOTE(review): self.xarticle may be unset on this command; the
            # local xarticle.pid is probably what was intended here — confirm.
            raise exceptions.ResourceDoesNotExist(f"Article {self.xarticle.pid} does not exist")

        # Merge existing article with new translation
        data_article = model_data_converter.db_to_article_data(article)
        # Keep every translation except the one in self.lang: it is being replaced.
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        # Re-import the merged article data. standalone=True: the issue itself
        # is left untouched, only this article is replaced.
        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article

996 

997 

class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd: imports a PCJ article from its JATS XML.

    When html_file_name is set, a "full-text" HTML datastream pointing to that
    file is attached to the parsed article before it is stored.
    """

    html_file_name = ""

    def internal_do(self):
        super().internal_do()

        parsed_article = jats_parser.JatsArticle(tree=self.tree)

        if self.html_file_name:
            # Attach the separate HTML full text as a datastream.
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            parsed_article.streams.append(stream)

        # Delegate the actual import to addArticleXmlCmd (standalone mode:
        # the issue itself is not modified).
        add_cmd = addArticleXmlCmd(
            {
                "xarticle": parsed_article,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()

1030 

1031 

class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
        - exceptions.ResourceExists during do if the book already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Book does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    provider = None
    import_oai_mode = False
    journal = None
    xml_format = "xmldata_jats"
    xbook = None
    _collection = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        # Add the book parts, numbering them sequentially from 1.
        if xparts:
            seq = 1
            for xpart in xparts:
                self.add_part(xpart, seq, pseq)
                seq += 1

    def add_part(self, xpart, seq, pseq):
        """Store one book part (recursively including its sub-parts).

        Raises exceptions.ResourceExists if the part is already in the database.
        """
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        # A part can itself contain sub-parts: recurse.
        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            # The XML class is chosen dynamically (xmldata_jats by default).
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            self.provider = model_helpers.get_provider_by_name(xbook.provider)

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Bug fix: use self.provider, not the local `provider`, so a provider
        # injected with set_provider() is honoured. The local name was only
        # bound when self.provider was empty, causing a NameError otherwise.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book

1189 

1190 

1191###################################################################################### 

1192###################################################################################### 

1193# 

1194# Update Commands 

1195# 

1196###################################################################################### 

1197###################################################################################### 

1198 

1199 

class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection does not exist
        - RuntimeError if undo is called
    """

    @staticmethod
    def _parse_collection_node(node):
        """Parse one metadata node into its xcol object.

        Returns None for tags that are not collection metadata, so callers
        can skip them safely.
        """
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Update one collection from its parsed metadata.

        With do_update=False only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist if the collection is unknown.
        Returns the (possibly updated) Collection, or None if xcol is falsy.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)
            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

        return col

    def internal_do(self):
        super().internal_do()

        collections = []

        # First pass: check that every collection exists before touching
        # anything, so a missing collection aborts without partial updates.
        for node in self.tree:
            self.update_collection(self._parse_collection_node(node), False)

        # Second pass: perform the actual updates.
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is None:
                # Unknown tag: nothing to update.
                # (Previously this crashed with AttributeError on xcol.warnings.)
                continue
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1278 

1279 

1280##################################################################### 

1281# 

1282# replaceIssueXmlCmd: updates an issue 

1283# 

1284# Exception raised: 

1285# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist 

1286# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value> 

1287# - RuntimeError if undo is called 

1288# 

1289###################################################################### 

class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: deletes an existing issue and re-imports it from the XML.

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Journal/Issue does not exist
        - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue before re-importing it.
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Bug fix: my_publisher can be None; guard before dereferencing it
        # (consistent with the publisher handling in addBookXmlCmd).
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1344 

1345 

class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Book does not exist
        - RuntimeError if undo is called
    """

    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
        self.warnings.extend(parsed_book.warnings)

        existing_book = model_helpers.get_container(parsed_book.pid)
        if existing_book is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Drop the current container before re-importing it from the XML.
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing_book)
        delete_cmd.undo()

        add_cmd = addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "no_bib": self.no_bib,
                "to_folder": self.to_folder,
            }
        )
        return add_cmd.do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1396 

1397 

class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        super().internal_do()

        root_tag = normalize(self.tree.tag)

        if root_tag == "journal-issue":
            # An issue: delegate to the issue-specific command.
            delegate = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif root_tag == "book":
            # A book: delegate to the book-specific command.
            delegate = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        obj = delegate.do()
        self.warnings.extend(delegate.warnings)
        return obj

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1457 

1458 

class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    Adds an issue if it is not in the system or updates the issue if it is already there.
    By default, no DOI is assigned for the articles. Set assign_doi to True.

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy
    backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json

    keep_metadata:
        True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder
        Default: False
        Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called
        We always want to preserve GraphicalAbstracts (they are not in the issue XML)

    keep_translations:
        True if you want back up/restore translations.
        Default: False
        Note: When you post an article to a journal (test) website, the translation is declared in the XML
        But if you import a Cedrics article in Trammel, the XML does not list translations

    Exception raised:
        - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
          <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError if undo is called
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    assign_doi = False
    full_text_folder = ""

    xissue = None
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

        # assign_doi needs a fresh import; keep_metadata restores old ids.
        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.backup_folder is None:
            raise ValueError("backup_folder needs to be set when keep_metadata is true.")

    def internal_do(self):
        super().internal_do()

        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree, from_folder=self.from_folder, no_bib=self.no_bib
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print each distinct warning value only once.
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        if self.fake:
            # Parse-only mode: stop before any database change.
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.embargo and existing_issue.embargo():
                # Geodesic is for open access articles.
                # We do not want to import the issues under embargo
                print(f"Embargo, ignore {xissue.pid}")
                return None

            if self.keep_metadata:
                # Start by exporting a backup of the existing issue, in case a
                # bug corrupts the import.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"),
                    }
                ).do()

                # Save the extra data (extid, deployed_date, ...) in a JSON
                # file that is re-imported along with the new issue.
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.backup_folder,
                    "export_all": True,
                    "with_binary_files": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Always preserve objects that are not in the XML metadata
            # (e.g. GraphicalAbstracts); see the class docstring.
            for article in existing_issue.article_set.all():
                backup_obj_not_in_metadata(article)
                if self.keep_translations:
                    backup_translation(article)

            # Delete the existing issue, otherwise the import would complain
            # about already-existing articles.
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, articles of the "to appear" volume are moved into a new
            # volume before publication (from AIF_0__0_ to AIF_2018... for example).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3; AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not an option since doi3 would be lost.
            # The articles common to _0__0 and 2018_ have to be deleted before
            # importing the new volume, otherwise there would be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the extra data (extid, deployed_date, ...) in a JSON
                # file re-imported with the new issue, together with the image
                # associated via ptf-tools.
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.backup_folder,
                        "export_all": True,
                        "with_binary_files": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                for xarticle in xissue:
                    xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        backup_obj_not_in_metadata(article)
                        if self.keep_translations:
                            backup_translation(article)

                        # to_folder: so the binary files are deleted as well
                        params = {"to_folder": self.to_folder}
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        # If backup_folder is not None, addIssueXmlCmd.post_do() re-imports the
        # saved extra data with importExtraDataPtfCmd.
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "extra_folder": self.backup_folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "embargo": self.embargo,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if new_issue:
            new_articles = new_issue.article_set.all()

            # With the assign_doi option, check that DOIs were actually assigned.
            for article in new_articles:
                if self.assign_doi and article.doi is None:
                    raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

                # TODO garbage collector on articles no longer in the issue
                restore_obj_not_in_metadata(article)
                if self.keep_translations:
                    restore_translation(article)

        return new_issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1658 

1659 

class addOrUpdateBookXmlCmd(addXmlCmd):
    """Import a book, deleting any existing container with the same pid first."""

    xbook = None
    no_bib = False  # Ignore the references during the import (used in Geodesic)

    def internal_do(self):
        super().internal_do()

        if self.xbook:
            parsed_book = self.xbook
        else:
            parsed_book = jats_parser.BitsBook(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(parsed_book.warnings)

        existing_book = model_helpers.get_container(parsed_book.pid)

        if existing_book:
            # Remove the current version of the book before re-adding it.
            delete_cmd = ptf_cmds.addContainerPtfCmd()
            delete_cmd.set_object_to_be_deleted(existing_book)
            delete_cmd.undo()

            collection = existing_book.get_collection()

            # Recompute the collection first/last years now that the book is gone.
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        add_cmd = addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "solr_commit": False,
            }
        )
        return add_cmd.do()

1703 

class updateBibitemCitationXmlCmd(baseCmd):
    """Refresh a bibitem's citation_xml/html/tex from its current BibItemIds."""

    def __init__(self, params=None):
        self.bibitem = None
        super().__init__(params)
        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the current ids of the bibitem, keyed by id_type.
        new_ids = {
            bid.id_type: {
                "id_type": bid.id_type,
                "id_value": bid.id_value,
                "checked": bid.checked,
                "false_positive": bid.false_positive,
            }
            for bid in self.bibitem.bibitemid_set.all()
        }

        # Rebuild the citation representations with the refreshed ids.
        xbibitem = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(xbibitem.warnings)

        self.bibitem.citation_xml = xbibitem.citation_xml
        self.bibitem.citation_html = xbibitem.citation_html
        self.bibitem.citation_tex = xbibitem.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")

1739 

1740 

1741###################################################################################### 

1742###################################################################################### 

1743# 

1744# Import Commands 

1745# 

1746###################################################################################### 

1747###################################################################################### 

1748 

1749 

class collectEntireCollectionXmlCmd(baseCmd):
    """
    Get the PIDs of all the XML of a collection (collection.xml, issues.xml)
    of a given folder

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()

        # Keep only the pid of each (pid, file) pair found in the folder
        collected = []
        for issue_pid, _file in resolver.iterate_collection_folder(self.folder, self.pid):
            collected.append(issue_pid)
        return collected

1769 

1770 

class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        # pid of the collection and source/destination/backup folders
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False
        self.keep_translations = False

        # with_cedrics: also import the Cedrics metadata files at the end
        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""  # optional: start the import at this issue
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Optional progress reporting: callback(job, i) is called after each issue;
        # when callback is None, progress is printed to stdout instead.
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        super().internal_do()

        # Make sure the collection exists; create it from the archive body otherwise
        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        # Import every issue found in the collection folder
        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                # The issue XML is in Cedrics format: parse it directly
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import the Cedrics metadata XML files of the collection
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()

1891 

1892 

class importCedricsIssueXmlCmd(baseCmd):
    """
    Import a Cedrics issue: convert its XML to JATS with cedram2ptf.py,
    then either import it in the database or, when diff_only is True,
    only compare it with the existing data.
    """

    def __init__(self, params=None):
        self.colid = None  # pid of the collection
        self.input_file = None  # path of the Cedrics issue XML
        self.remove_email = True
        self.remove_date_prod = True
        self.diff_only = False  # compare with the DB instead of importing
        self.body = None  # text of the converted XML (set by import_cedrics_issue)
        self.xissue = None  # parsed issue; when provided, the conversion is skipped
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): assumes issue.article_set and tex_folders are aligned
            # (same order, article_set no longer than tex_folders) — confirm upstream
            i = 0
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Record the folder name as the article's ojs-id
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy the binary files (e.g. images)
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        # params['body'] = self.body

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        # resolver.copy_binary_files(
        #     issue,
        #     settings.CEDRAM_XML_FOLDER,
        #     settings.MERSENNE_TEST_DATA_FOLDER)

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        # Compare self.xissue with the issue stored in the database.
        # Returns (result, issues_diff, xissue) where result is True when they match.
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything the comparator will traverse, to avoid N+1 queries
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            # No stored issue: nothing to compare, result stays True
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            time2 = timezone.now()
            delta = time2 - time1

            # NOTE(review): the value of this expression is discarded — it looks
            # like a leftover timing computation (delta.total_seconds() unused)
            delta.seconds + delta.microseconds / 1e6
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        # Remove output_folder/<colid>/<basename> if present and make sure the
        # destination folders exist; returns the (now absent) output file path.
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This worflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        # NOTE(review): this command line is executed through the shell
        # (shell=True below); the paths come from settings/trusted callers —
        # never feed it untrusted input
        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            sys.path.append(ptf_xsl_folder + "/lib")

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                # Append the converter's log to the error before propagating it
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                file_.close()
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        # Parse the converted JATS file into self.xissue
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Convert the Cedrics XML unless a parsed issue was provided
        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result

2125 

2126 

2127# import from /cedram_dev/production_tex/CEDRAM 

class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """
    Import a Cedrics issue directly from /cedram_dev/production_tex/CEDRAM:
    the Cedrics XML is parsed as-is, without the Cedrics -> JATS transformation.
    """

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None
        self.force_dois = True  # raise if an article has no DOI
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """
        Return the lines of *filename*, dropping its first ``skip_lines + 1``
        lines (XML prolog / DOCTYPE / root element).

        The file is read as UTF-8; legacy files that are not valid UTF-8 are
        re-read as iso-8859-1.
        """

        def read_lines(encoding):
            # i > skip_lines keeps lines starting at index skip_lines + 1,
            # matching the historical behavior of this method
            with open(filename, encoding=encoding) as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

        try:
            return read_lines("utf-8")
        except UnicodeDecodeError:
            return read_lines("iso-8859-1")

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        # Derive the issue pid from the input file name
        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for article_basename in self.article_folders:
                src_file = os.path.join(
                    tex_src_folder, article_basename, article_basename + "-cdrxml.xml"
                )
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """
        Add or update the parsed issue (self.xissue) in Django/SolR,
        then import the article full texts in HTML.
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

2243 

2244 

class addCedricsIssueXmlCmd(addXmlCmd):
    """Parse a Cedrics issue XML tree and return the resulting xissue."""

    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False

    def internal_do(self):
        super().internal_do()

        # Parse self.tree and keep the parsed issue on the command
        xissue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)
        self.xissue = xissue
        return xissue

2260 

2261 

class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """
    Import (or re-import) a single Cedrics article inside an existing issue.

    The article XML is read from
    CEDRAM_TEX_FOLDER/<colid>/<container_pid>/<article_folder_name>/<name>-cdrxml.xml.
    When the article already exists, its extra data (extids, deployed dates,
    translations, ...) are backed up before the import and restored afterwards.

    Raises:
        exceptions.ResourceDoesNotExist: if the issue is not in the database.
        ValueError: if the parsed article has no DOI.
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, _ = resolver.get_cedram_tex_folders(colid, self.container_pid)
        for seq, folder in enumerate(article_folders, start=1):
            if folder == self.article_folder_name:
                xarticle.seq = seq

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/removal of the existing article
        if existing_article:
            # First, back up the whole issue in case of a bug
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

            # No need to delete the existing article:
            # addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy the binary files (e.g. images)
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article

2395 

2396 

class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    Transform the JATS body of the XML tree into HTML text.

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the JATS -> HTML stylesheet
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))

        # Apply it, then extract the inner HTML of <body><article><main>
        root = transform(self.tree).getroot()
        main_node = root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")

2420 

2421 

class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    Read the JATS body of an article and store the corresponding HTML.

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        super().pre_do()

        # At least one of pid / article must be given; derive the missing one
        if self.article is None and self.pid is None:
            raise ValueError("pid et article sont vides")
        if self.article is None:
            self.article = model_helpers.get_article(self.pid)
        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # Should we collect the warnings of the HTML parsing?
        # self.warnings.extend(xarticle.warnings)

        # Replace the html-image related objects with the figures of the body
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(xarticle.figures, self.article, "RelatedObject")

        update_cmd = ptf_cmds.updateArticlePtfCmd(
            {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }
        )
        update_cmd.set_article(self.article)
        update_cmd.do()

        # updateArticlePtfCmd is not an addPtfCmd, so copy_binary_files
        # (which calls resolver.copy_html_images) is not triggered:
        # copy the article images explicitly here
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )

2479 

2480 

class updateCacheXmlCmd(baseCmd):
    """
    recreate the citation_html field of the bibitems

    Params: colid: pid of the collection to process
    """

    def __init__(self, params=None):
        self.colid = None
        # optional container pid: containers are skipped until this pid is met
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])

    def update_article(self, xarticle):
        # Refresh the cached HTML/TeX fields of one article from its re-parsed XML
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): zip() stops at the shorter sequence; assumes the parsed
        # XML and the database rows are aligned (same order, same count) — confirm
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        # Sites that display the full text also refresh the body fields
        if hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY:
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

    def internal_do(self):
        super().internal_do()

        collection = model_helpers.get_collection(self.colid)
        if collection is None:
            raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist")

        qs = collection.content.all().order_by("pid")
        # start is True immediately when no start_id was given
        start = self.start_id is None
        for container in qs:
            if not start and container.pid == self.start_id:
                start = True

            if start:
                print(container.pid)
                with_body = hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
                # Re-export the container as XML, then re-parse it to recompute
                # the cached HTML/TeX values of each article
                xml_body = ptf_cmds.exportPtfCmd(
                    {"pid": container.pid, "with_body": with_body}
                ).do()

                parser = etree.XMLParser(
                    huge_tree=True,
                    recover=True,
                    remove_blank_text=False,
                    remove_comments=True,
                    resolve_entities=True,
                )
                tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser)
                xissue = jats_parser.JatsIssue(tree=tree)

                for xarticle in xissue:
                    self.update_article(xarticle)