Coverage for apps/ptf/cmds/xml_cmds.py: 67%
1212 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1import copy
2import datetime
3import os.path
4import subprocess
5import sys
6import traceback
8from lxml import ElementInclude
9from lxml import etree
11from django.conf import settings
12from django.db import transaction
13from django.db.models import Prefetch
14from django.utils import timezone
16from ptf import exceptions
17from ptf import model_data
18from ptf import model_data_comparator
19from ptf import model_data_converter
20from ptf import model_helpers
21from ptf import tex
22from ptf import utils
23from ptf.cmds import ptf_cmds
24from ptf.cmds import solr_cmds
25from ptf.cmds.base_cmds import baseCmd
26from ptf.cmds.xml import xml_utils
27from ptf.cmds.xml.cedrics import cedrics_parser
29# KEEP THIS UNUSED IMPORT THEY ARE USED
30from ptf.cmds.xml.jats import jats_parser
31from ptf.cmds.xml.jats import xmldata as xmldata_jats
32from ptf.cmds.xml.xml_utils import normalize
33from ptf.display import resolver
34from ptf.models import Article
35from ptf.models import Collection
36from ptf.models import Container
37from ptf.models import Person
38from ptf.models import backup_obj_not_in_metadata
39from ptf.models import backup_translation
40from ptf.models import restore_obj_not_in_metadata
41from ptf.models import restore_translation
def find_file(name):
    """Return the absolute path of *name* searched recursively under every
    directory listed in ``settings.MANAGER_XSLT_DIRS``, or ``None`` if absent."""
    for search_dir in settings.MANAGER_XSLT_DIRS:
        for dirpath, _dirnames, filenames in os.walk(search_dir):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None
def get_transform(name):
    """Build an lxml XSLT transform from the stylesheet ``<name>.xsl``.

    The stylesheet is located with :func:`find_file` among the directories of
    ``settings.MANAGER_XSLT_DIRS``.

    :param name: stylesheet base name, without the ``.xsl`` extension
    :return: a callable ``etree.XSLT`` object
    :raises FileNotFoundError: if no ``<name>.xsl`` file can be found
    """
    file_path = find_file(f"{name}.xsl")
    if file_path is None:
        # etree.parse(None) would raise an obscure TypeError; fail explicitly
        # with the name of the missing stylesheet instead.
        raise FileNotFoundError(f"XSLT stylesheet not found: {name}.xsl")
    xslt_doc = etree.parse(file_path)
    return etree.XSLT(xslt_doc)
class addXmlCmd(baseCmd):
    """
    addXmlCmd: base class for commands that take an XML as input
    The XML is passed with the body param

    from_folder / to_folder: location of binary files to copy

    Example with a file:
        f = open('journal.xml')
        body = f.read()
        f.close()
        cmd = add...XmlCmd( { "body":body } )

    Exception raised:
        - ValueError if the init params are empty
    """

    # When False, subclasses provide already-parsed data and no XML body is required.
    use_body = True
    # Raw XML text to parse (required when use_body is True).
    body = None
    # lxml root node built from body in pre_do().
    tree = None
    solr_commit_at_the_end = True
    # Name of the log file the body was written to (set in pre_do).
    xml_filename_in_log = None
    remove_blank_text = False
    # Base folder used by ElementInclude to resolve hrefs.
    xml_file_folder = None

    def __init__(self, params=None):
        super().__init__(params)

        if self.use_body:
            self.required_params.extend(["body"])

    def get_logname(self):
        """Return an unused log file name "<today>-<ClassName>-<i>.xml" inside
        settings.LOG_DIR, or "" if LOG_DIR is not configured."""
        filename = ""

        if hasattr(settings, "LOG_DIR"):
            i = 0
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__ + "-"
            filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

            # Increment the suffix until we find a name that is not taken yet.
            while os.path.isfile(filename):
                i += 1
                filename = os.path.join(settings.LOG_DIR, basename + str(i) + ".xml")

        return filename

    def pre_do(self):
        """Parse self.body into self.tree (when use_body is True) and archive
        the XML body in settings.LOG_DIR."""
        super().pre_do()

        if self.use_body:
            # The Cedrics -> JATS XSLT transform manually adds space=preserve around
            # the nodes with mixed-content, but leaves the text unchanged.
            # As such, parsing the Cedrics XML cannot be done with remove_blank_text=True
            # Or the spaces will be removed whereas the JATS XML will keep them.
            # We still need the remove_blank_text=True for JATS XML for all the other nodes
            parser = etree.XMLParser(
                huge_tree=True,
                recover=True,
                remove_blank_text=self.remove_blank_text,
                remove_comments=True,
                resolve_entities=True,
            )

            if self.xml_file_folder is not None:
                if self.xml_file_folder[-1] != "/":
                    self.xml_file_folder += "/"
                # For ElementInclude to find the href: drop the xlink namespace
                # so that xlink:href attributes become plain href attributes.
                self.body = self.body.replace(
                    'xmlns:xlink="http://www.w3.org/1999/xlink"', ""
                ).replace("xlink:href", "href")
            tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)

            if self.xml_file_folder is not None:
                ElementInclude.include(tree, base_url=self.xml_file_folder)
            self.tree = tree

            if self.tree is None:
                raise ValueError("tree est vide")

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                self.xml_filename_in_log = self.get_logname()

                with open(self.xml_filename_in_log, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    @transaction.atomic
    def do(self, parent=None):
        """Execute the command inside a DB transaction.

        On failure: roll back Solr, clear sub-commands (so undo is skipped),
        log the failure with its traceback in LOG_DIR/cmds.log, and re-raise.
        On success: commit Solr (if solr_commit_at_the_end) and return the
        created object(s).
        """
        try:
            obj = super().do(parent)
        except Exception as e:
            ptf_cmds.do_solr_rollback()

            # Empty sub_cmds to ignore undo
            self.cmds = []

            # Write the xml body on disk
            if hasattr(settings, "LOG_DIR") and self.body and self.use_body:
                with open(
                    os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8"
                ) as file_:
                    file_.write("----------------------\n")

                    if self.xml_filename_in_log is None:
                        self.xml_filename_in_log = self.get_logname()

                    file_.write(self.xml_filename_in_log + " : FAILED\n")
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    for line in lines:
                        file_.write(line + "\n")
                    file_.write("----------------------\n")

            raise e

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

        return obj

    def post_undo(self):
        super().post_undo()

        # Remove Person objects that are no longer referenced.
        Person.objects.clean()

    def post_do(self, resource=None):
        """Log the pids of the created resource(s) and archive the XML next to
        the resource's collection folder in LOG_DIR."""
        super().post_do(resource)

        # Remove Person objects that are no longer referenced.
        Person.objects.clean()

        if hasattr(settings, "LOG_DIR") and resource and self.use_body:
            today = datetime.date.today()
            basename = str(today) + "-" + self.__class__.__name__

            # resource may be a single object or a list: build "pid1, pid2, ..."
            pids = ""
            first = True
            if isinstance(resource, list):
                for resource_item in resource:
                    if first:
                        first = False
                    else:
                        pids += ", "

                    pids += resource_item.pid
            else:
                pids = resource.pid

            with open(os.path.join(settings.LOG_DIR, "cmds.log"), "a", encoding="utf-8") as file_:
                file_.write(basename + " : " + pids + "\n")

            if hasattr(resource, "my_collection") and resource.my_collection:
                folder = os.path.join(
                    settings.LOG_DIR, resource.get_top_collection().pid, resource.pid
                )
                filename = os.path.join(folder, resource.pid + ".xml")
                resolver.create_folder(folder)
                with open(filename, "w", encoding="utf-8") as file_:
                    file_.write(self.body)

    def undo(self):
        super().undo()

        if self.solr_commit_at_the_end:
            ptf_cmds.do_solr_commit()

    def add_objects_with_location(self, xobjs, resource, cmd_type):
        """Attach location-based objects (ExtLink / RelatedObject /
        SupplementaryMaterial / DataStream) parsed from the XML to *resource*.

        :param xobjs: list of dicts with at least "base", "rel", "location"
        :param resource: the Resource the objects are attached to
        :param cmd_type: one of "ExtLink", "RelatedObject",
            "SupplementaryMaterial", "DataStream"
        """
        seq = 1

        for xobj in xobjs:
            base = None

            if xobj["base"]:
                base_name = xobj["base"]
                base = model_helpers.get_xmlbase(base_name)
                if base is None:
                    cmd = ptf_cmds.addXmlBasePtfCmd({"base": xobj["base"], "solr_commit": False})
                    base = cmd.do(self)

            rel = xobj["rel"]
            location = xobj["location"]

            params = {
                "rel": rel,
                "mimetype": xobj.get("mimetype", ""),
                "location": location,
                "seq": seq,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # Ignore XML file
            if params["mimetype"] != "application/xml":
                if "metadata" in xobj:
                    params["metadata"] = xobj["metadata"]

                if "text" in xobj:
                    params["text"] = xobj["text"]

                # TODO: cmd factory ?
                cmd = None
                if cmd_type == "ExtLink":
                    cmd = ptf_cmds.addExtLinkPtfCmd(params)
                elif cmd_type == "RelatedObject":
                    cmd = ptf_cmds.addRelatedObjectPtfCmd(params)
                elif cmd_type == "SupplementaryMaterial":
                    params["caption"] = xobj.get("caption", "")
                    params["supplementary_material"] = True
                    cmd = ptf_cmds.addSupplementaryMaterialPtfCmd(params)
                elif cmd_type == "DataStream":
                    cmd = ptf_cmds.addDataStreamPtfCmd(params)

                # Always try to add an ExtLink or a RelatedObject
                # May raise ResourceExists if the ExtLink/RelatedObject is added twice
                if cmd is not None:
                    cmd.set_base(base)
                    cmd.set_resource(resource)

                    cmd.do(self)

                seq += 1

    @staticmethod
    def remove_publisher(publisher):
        """Delete *publisher* by undoing an addPublisherPtfCmd."""
        cmd = ptf_cmds.addPublisherPtfCmd()
        cmd.set_object_to_be_deleted(publisher)
        cmd.undo()

    # Update the published years of a collection (journal/acta/book-series...)
    @staticmethod
    def update_collection_years(pid, container, save=True):
        """Widen the [fyear, lyear] range of the collection *pid* so that it
        includes *container*.year. Persists the collection when *save* is True.
        """
        collection = Collection.objects.get(pid=pid)
        if container.year:
            year = container.year
            fyear, lyear = model_helpers.get_first_last_years(year)
            fyear = int(fyear)
            lyear = int(lyear)

            # Fix: test the falsy (None/0) guard BEFORE the comparison.
            # The original "fyear < collection.fyear or not collection.fyear"
            # raised TypeError on Python 3 when collection.fyear was None.
            if not collection.fyear or fyear < collection.fyear:
                collection.fyear = fyear

            if not collection.lyear or lyear > collection.lyear:
                collection.lyear = lyear

            if save:
                collection.save()
class addCollectionsXmlCmd(addXmlCmd):
    """
    addCollectionsXmlCmd: adds/remove a collection

    TODO: merge Collection and Journal ?

    Exception raised:
        - exceptions.ResourceExists during do
            if the Collection already exists
            if the collection defines the same extlink/relatedobject multiple times
        - exceptions.ResourceDoesNotExist
            during undo if the Collection does not exist
            during do of the provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # Fallback Provider used when the parsed XML does not declare one.
    provider = None
    xml_format = None

    def set_provider(self, provider):
        self.provider = provider

    def add_collection(self, xcol, update=False):
        """Create (or update, when *update* is True) one Collection from the
        parsed XML object *xcol*.

        Returns the Collection, or None when *xcol* is empty.
        Raises exceptions.ResourceExists when the collection already exists
        and *update* is False.
        """
        if not xcol:
            return None

        # Prefer the provider declared in the XML, fall back to self.provider.
        if xcol.provider:
            provider = model_helpers.get_provider_by_name(xcol.provider)
        else:
            provider = self.provider

        col_id = xcol.pid
        collection = model_helpers.get_collection(col_id)

        existing = False

        if collection is not None:
            existing = True
            if not update:
                raise exceptions.ResourceExists(f"Collection {collection.pid} already exists")

        # Create a collection
        params = {
            "xobj": xcol,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        # Choose the add or update sub-command depending on prior existence.
        cls = ptf_cmds.addCollectionPtfCmd
        if update and existing:
            cls = ptf_cmds.updateCollectionPtfCmd

        cmd = cls(params)
        cmd.set_provider(provider)
        collection = cmd.do(self)

        # Attach the <ext-link> elements found in the XML to the collection.
        self.add_objects_with_location(xcol.ext_links, collection, "ExtLink")

        # if publisher:
        #     model_helpers.publish_resource(publisher, journal)

        return collection

    def internal_do(self):
        """Parse self.tree and create one Collection per <publication-meta>
        child node. Returns the list of created Collections.

        Raises ValueError for <journal-meta>/<collection-meta> nodes: creating
        a journal/collection from those elements is not supported here.
        """
        super().internal_do()

        collections = []

        if self.tree.tag == "journal-meta":
            raise ValueError(
                "Creation of a journal on the fly from an article is not yet supported"
            )
            # # Code used when a journal is created on the fly while parsing an article (GDML - OAI)
            # # TODO 1 : Refactor all the JATS parsers (eudml/bdim/dmlcz/....)
            # #          to be compatible with jats_parser.py
            # # TODO 2 : Prevent the creation of the collection on the fly ?
            # #          Shouldn't the collection be monitored/controlled ?
            # xmldata = globals()[self.xml_format]
            # xcol = xmldata.Journal(self.tree)
            # collection = self.add_collection(xcol, update=True)
            # collections.append(collection)
        else:
            for node in self.tree:
                xcol = None
                if node.tag == "collection-meta":
                    raise ValueError("Collection can only be created from <publication-meta>")
                    # xcol = jats_parser.BitsCollection(tree=node)
                elif node.tag == "journal-meta":
                    raise ValueError(
                        "Collection can only be created from <publication-meta>, <journal-meta> are handled while parsing a <journal-issue>"
                    )
                    # xcol = jats_parser.JatsJournal(tree=node)
                elif node.tag == "publication-meta":
                    xcol = jats_parser.MathdocPublication(tree=node)

                collection = self.add_collection(xcol)
                collections.append(collection)

        return collections
class addIssueXmlCmd(addXmlCmd):
    """
    addIssueXmlCmd: adds/remove an issue

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy

    extra_folder: folder where extra data (extid false_positive...) are stored in a json
    It is used
        - when you call addIssueXmlCmd directly to import from an archive,
        - when you call addOrUpdateIssueXmlCmd and we need to restore extra data after the import

    Exception raised:
        - exceptions.ResourceExists during do if the issue already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Issue does not exist
            during do if the serial/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
        - RuntimeError during undo if resources are still published
    """

    # When True, assign a DOI to each imported article.
    assign_doi = False
    full_text_folder = ""
    extra_folder = None
    # NOTE(review): internal_do/post_do use self._prod_deployed_date_iso_8601_date_str
    # (leading underscore); this class attribute (no underscore) appears unused here
    # — confirm against the rest of the file.
    prod_deployed_date_iso_8601_date_str = None
    # Pre-parsed issue data; when set, the XML body is not re-parsed.
    xissue = None
    count = 0
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def create_child_collection(self, xjournal, journal):
        """Create a child Collection of *journal* for the (e-)issn of *xjournal*.

        The child pid is "<xjournal.pid>-<issn>"; its provider is "mathdoc".
        Returns the new Collection.
        """
        issn = xjournal.issn if xjournal.issn else xjournal.e_issn

        new_xjournal = copy.deepcopy(xjournal)
        new_xjournal.wall = 0
        new_xjournal.pid = f"{xjournal.pid}-{issn}"
        new_xjournal.coltype = journal.coltype

        params = {"xobj": new_xjournal}
        provider = model_helpers.get_provider_by_name("mathdoc")

        cmd = ptf_cmds.addCollectionPtfCmd(params)
        cmd.set_parent(journal)
        cmd.set_provider(provider)

        collection = cmd.do()
        # collection.parent = journal
        # journal = collection
        return collection

    def get_historic_collection(self, xjournal, journal):
        """Return the Collection that matches *xjournal* when meta-collections
        are enabled (settings.USE_META_COLLECTIONS); *journal* is the top
        collection. Creates the child collection on the fly if needed.
        """
        use_meta_collections = (
            settings.USE_META_COLLECTIONS if hasattr(settings, "USE_META_COLLECTIONS") else False
        )

        if not use_meta_collections:
            return journal

        # meta-collections are used : journal may be the top collection or one of its children

        value = id_type = None

        # Take care of special case of STNB :
        # For that, we ignore the issn of STNB 2nd series
        if xjournal.pid == "JTNB" and xjournal.issn == "0989-5558":
            xjournal.issn = None
            xjournal.e_issn = None
            xjournal.ids = []
        else:
            if xjournal.issn:
                value = xjournal.issn
                id_type = "issn"
            elif xjournal.e_issn:
                value = xjournal.e_issn
                id_type = "e-issn"

        if value:
            # collection has at least one issn
            qs = Collection.objects.filter(resourceid__id_value=value, resourceid__id_type=id_type)
            if qs.exists():
                journal = qs.first()
            else:
                # xjournal does not exist yet.
                journal = self.create_child_collection(xjournal, journal)
        else:
            # collection has no issn
            # NOTE(review): value is None in this branch, so the second
            # candidate pid is "<pid>-None" — confirm this is intentional.
            possible_pids = [xjournal.pid, f"{xjournal.pid}-{value}"]
            qs = Collection.objects.exclude(resourceid__id_value__isnull=False).filter(
                pid__in=possible_pids
            )
            if qs.exists():
                journal = qs.first()
            else:
                journal = self.create_child_collection(xjournal, journal)

        return journal

    def internal_do(self):
        """Create the Container for the issue, then one Article per item of
        the issue. Returns the Container (or None when skipped for embargo).
        """
        super().internal_do()

        #######################################################################
        # get xissue

        if self.xissue:
            xissue = self.xissue
        else:
            xissue = jats_parser.JatsIssue(tree=self.tree, no_bib=self.no_bib)
            self.warnings.extend(xissue.warnings)

        #######################################################################
        # Check if there is an existing issue / journal

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is not None:
            raise exceptions.ResourceExists(f"Issue {issue_id} already exists")

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        # Note: Why use <issue-meta><custom-meta-group><custom-meta> to find the provider and then the journal
        # as there is a <journal-meta> with an id ?
        # The ptf_resource table (Resource objects) are created with only 1 id.
        # When you add a journal, the journal id is the one of its
        # <custom-meta-group><custom-meta> provider.
        # If you want to find the journal of an issue based on the <journal-meta> information, you might
        # have to search among the other ids (ptf_resourceid table, ResourceId objects) : sql JOIN select
        # To avoid the join select, it's better to use <issue-meta><custom-meta-group><custom-meta> to make sure
        # we use the correct provider. A simple select in the ptf_resource table is then needed.
        if journal is None:
            raise exceptions.ResourceDoesNotExist(f"Journal {journal_id} does not exist")

        # Journal is the top collection (ex: AFST)
        # We want to get (or create) the journal that corresponds to the issue
        journal = self.get_historic_collection(xjournal, journal)

        if self.embargo and journal.wall > 0:
            # Geodesic is for open access articles.
            # We do not want to import the issues under embargo
            if resolver.embargo(journal.wall, xissue.year):
                print(f"Embargo, ignore {xissue.pid}")
                return None

        #######################################################################
        # Get provider/publisher

        provider_name = xissue.provider if xissue.provider else "mathdoc"
        provider = model_helpers.get_provider_by_name(provider_name)

        #######################################################################
        # Add the issue

        params = {
            "xobj": xissue,
            "pid": xissue.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(journal)
        cmd.set_provider(provider)
        issue = cmd.do(self)

        self.add_objects_with_location(xissue.ext_links, issue, "ExtLink")
        self.add_objects_with_location(xissue.related_objects, issue, "RelatedObject")
        self.add_objects_with_location(xissue.streams, issue, "DataStream")

        #######################################################################
        # Add the issue's articles

        # JatsIssue is an iterator (has the __iter__ function)
        # you simply iterate the xissue to get its articles
        for seq, xarticle in enumerate(xissue, start=1):
            params = {
                "xarticle": xarticle,
                "journal": journal,
                "issue": issue,
                "seq": seq,
                "provider": provider,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,
                "use_body": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit_at_the_end": False,
            }
            cmd = addArticleXmlCmd(params)
            cmd.do(self)

        # Update the top journal first year and last year
        self.update_collection_years(journal_id, issue)

        # The collection maybe updated with update_collection_years and the assign_doi param (col.last_doi)
        # Update issue before returning the object.
        # Note that refresh_from_db does not update ForeignKey fields, we can't simply call issue.refresh_from_db()
        issue.my_collection.refresh_from_db()

        # Used in post_do
        self._prod_deployed_date_iso_8601_date_str = xissue.prod_deployed_date_iso_8601_date_str

        return issue

    def post_do(self, resource=None):
        """Finalize the imported issue: set last_modified if missing, propagate
        prod_deployed_date (on ptf_tools only), restore extra data."""
        super().post_do(resource)

        # If the issue XML has a last-modified date, keep it; otherwise create one.
        if resource.last_modified is None:
            resource.last_modified = timezone.now()
            resource.save()

        # On ptf-tools, if the issue XML has a prod_deployed_date,
        # propagate it to the Articles/Issue.
        # A later data restoration (with importExtraDataPtfCmd) may overwrite prod_deployed_date
        if self._prod_deployed_date_iso_8601_date_str and settings.SITE_NAME == "ptf_tools":
            prod_deployed_date = model_helpers.parse_date_str(
                self._prod_deployed_date_iso_8601_date_str
            )
            journal_site = model_helpers.get_site_mersenne(resource.my_collection.pid)
            if journal_site:
                model_helpers.update_deployed_date(resource, journal_site, prod_deployed_date)

        if self.extra_folder:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": resource.pid, "import_folder": self.extra_folder}
            ).do()
class addArticleXmlCmd(addXmlCmd):
    """
    addArticleXmlCmd: adds/remove an issue

    Exception raised:
        - exceptions.ResourceExists during do if the article already exists
        - exceptions.ResourceDoesNotExist
            during undo if the Article does not exist
            during do if the serial/issue/provider does not exist
            <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    """

    # Pre-parsed article data; when None, it is built from self.tree.
    xarticle = None
    journal = None
    issue = None
    provider = None
    provider_col = None
    assign_doi = False
    full_text_folder = ""
    xml_format = "xmldata_jats"
    # restricted_mode is used by maxiDML. We do not try to import all the metadata, but only a subset
    restricted_mode = False
    # standalone is used to import isolated article, without issues
    standalone = False
    seq = (
        0  # seq is used by the breadcrumbs. Generate it if it's not specified in the XML (ex: PCJ)
    )
    # When True, existing translations are backed up and restored around the re-import.
    keep_translations = False

    def set_collection(self, collection):
        self.journal = collection
        self.provider = collection.provider

    def set_xml_format(self, xml_format):
        self.xml_format = xml_format

    def set_provider(self, provider):
        self.provider = provider

    def set_provider_col(self, provider_col):
        self.provider_col = provider_col

    def set_article_single_mode(self):
        """Parse self.tree as a single JATS <article> into self.xarticle."""
        self.xarticle = jats_parser.JatsArticle(tree=self.tree)
        self.warnings.extend(self.xarticle.warnings)

        # TODO: MaxiDML: allow the creation of an issue on the fly
        # if not self.provider:
        #     self.provider = model_helpers.get_provider_by_name(self.xarticle.provider)
        #
        # xmldata_jats.set_pid_type(self.provider.pid_type)
        #
        # bdy = etree.tostring(self.xarticle.journal.tree).decode("utf-8")
        # cmd = addCollectionsXmlCmd({'body': bdy,
        #                             'xml_format': self.xml_format,
        #                             'coltype': "journal"})
        # cmd.set_provider(self.provider_col if self.provider_col else self.provider)
        # self.journal = cmd.do()[0]
        #
        # self.issue = model_helpers.get_container(self.xarticle.issue_id)
        # if self.issue is None:
        #     # need to create the issue
        #     date = datetime.datetime.strptime(self.xarticle.date_published_iso_8601_date_str,
        #                                       '%Y-%m-%d')
        #     pid = "{name}_{year}".format(name=self.journal.pid, year=date.year)
        #     self.issue = model_helpers.get_container(pid)
        #     if self.issue is None:
        #         params = {'ctype': 'issue', 'year': date.year, 'pid': pid,
        #                   'last_modified_iso_8601_date_str': datetime.datetime.now().strftime(
        #                       "%Y-%m-%d %H:%M:%S"), 'volume': self.xarticle.volume,
        #                   # if copy binary, need from_folder / to_folder
        #                   }
        #
        #         cmd = ptf_cmds.addContainerPtfCmd(params)
        #         cmd.add_collection(self.journal)
        #         cmd.set_provider(self.provider)
        #         self.issue = cmd.do()

    def get_oai_identifier(self):
        return self.xarticle.oai_identifier

    def update_xobj_with_body(self):
        """Fill self.xarticle.body (full text used for SolR indexing) when it
        is empty, from the article's PDF or from a separate XML file."""
        # Import CEDRICS, le plein texte provient d'un fichier séparé
        # (CEDRICS import: the full text comes from a separate file)
        if self.full_text_folder and not self.xarticle.body:
            if self.full_text_folder == settings.CEDRAM_TEX_FOLDER:
                # Extract the text from the article's full PDF.
                text = ""
                locs = [
                    stream["location"]
                    for stream in self.xarticle.streams
                    if stream["mimetype"] == "application/pdf"
                ]
                if locs:
                    full_pdf_location = os.path.join(self.full_text_folder, locs[0])
                    text = utils.pdf_to_text(full_pdf_location)
                self.xarticle.body = text
            else:
                # Read the <body> of a separate per-article XML file.
                full_text_file = self.full_text_folder + self.xarticle.pid + ".xml"

                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)
        elif not self.xarticle.body_xml and hasattr(self.xarticle, "pii"):
            # NOTE(review): hard-coded Numdam acquisition path — confirm it is
            # still valid for this deployment.
            full_text_file = os.path.join(
                "/numdam_dev/acquisition/donnees_traitees",
                self.journal.pid,
                self.issue.pid,
                self.xarticle.pid,
                self.xarticle.pid + ".xml",
            )
            if os.path.isfile(full_text_file):
                with open(full_text_file, mode="rb") as file_:
                    body = file_.read()

                parser = etree.XMLParser(huge_tree=True, recover=True)
                tree = etree.fromstring(body, parser=parser)
                node = tree.find("body")
                self.xarticle.body = xml_utils.get_text_from_node(node)
                self.xarticle.body_xml = xml_utils.get_xml_from_text("body", self.xarticle.body)

    def internal_do(self):
        """Create (or re-create) the Article from self.xarticle.

        If the article already exists and update/standalone mode is on, the
        existing article is deleted first (after backing up the data that is
        not in the metadata, and optionally its translations), then re-added.
        Returns the created Article.
        """
        super().internal_do()

        if self.xarticle is None and self.journal is not None:
            # self.restricted_mode = True
            self.set_article_single_mode()
            self.update = True
        else:
            self.update = False

        # Derive a pid from the DOI when the XML does not provide one.
        if self.xarticle.pid is None:
            self.xarticle.pid = (
                self.xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
            )

        # Load the HTML full text of the translations (separate files).
        for xtranslated_article in self.xarticle.translations:
            for xtream in xtranslated_article.streams:
                if xtream["mimetype"] == "text/html":
                    if self.from_folder is None:
                        raise ValueError(
                            "The article has its full text in a separate HTML file. You need to set from_folder"
                        )

                    location = os.path.join(self.from_folder, xtream["location"])
                    body_html = resolver.get_body(location)
                    body = xml_utils.get_text_from_xml_with_mathml(body_html)
                    xtranslated_article.body_html = body_html
                    xtranslated_article.body = body

        # Load the HTML full text of the article itself.
        for stream in self.xarticle.streams:
            if stream["mimetype"] == "text/html":
                location = os.path.join(self.from_folder, stream["location"])
                body_html = resolver.get_body(location)
                body = xml_utils.get_text_from_xml_with_mathml(body_html)
                self.xarticle.body_html = body_html
                self.xarticle.body = body

        if self.xarticle.doi:
            article = model_helpers.get_article_by_doi(self.xarticle.doi)
        else:
            article = model_helpers.get_article(self.xarticle.pid)
        needs_to_restore_article = False

        if article is not None:
            if self.update or self.standalone:
                if self.standalone:
                    self.provider = article.provider

                needs_to_restore_article = True
                backup_obj_not_in_metadata(article)

                if self.keep_translations:
                    backup_translation(article)

                cmd = ptf_cmds.addArticlePtfCmd(
                    {
                        "pid": article.pid,
                        # delete the files to be safe before re-importing
                        "to_folder": self.to_folder,
                    }
                )
                cmd.set_object_to_be_deleted(article)
                cmd.undo()
            else:
                raise exceptions.ResourceExists(f"Article {self.xarticle.pid} already exists")

        # Override seq
        if self.standalone and article is not None:
            self.xarticle.seq = article.seq
        elif (
            not self.standalone and self.issue and int(self.xarticle.seq) == 0 and self.seq != 0
        ) or (hasattr(self, "pii") and self.seq != 0):
            self.xarticle.seq = self.seq

        # Get the article's text (body) for SolR if it is empty from the PDF
        self.update_xobj_with_body()

        params = {
            "xobj": self.xarticle,
            "pid": self.xarticle.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "assign_doi": self.assign_doi and not self.xarticle.doi,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addArticlePtfCmd(params)
        if self.issue or not self.standalone:
            cmd.set_container(self.issue)
        cmd.add_collection(self.journal)
        article = cmd.do(self)

        self.add_objects_with_location(self.xarticle.ext_links, article, "ExtLink")
        self.add_objects_with_location(self.xarticle.streams, article, "DataStream")
        if not self.restricted_mode:
            self.add_objects_with_location(
                self.xarticle.supplementary_materials, article, "SupplementaryMaterial"
            )

            if (
                hasattr(settings, "SHOW_BODY") and settings.SHOW_BODY
            ) or settings.SITE_NAME == "ptf_tools":
                self.add_objects_with_location(self.xarticle.figures, article, "RelatedObject")

        # Attach the streams of each translation to its translated article.
        for xtrans_article, trans_article in zip(
            self.xarticle.translations, cmd.cmd.translated_articles
        ):
            self.add_objects_with_location(xtrans_article.streams, trans_article, "DataStream")

        if needs_to_restore_article:
            restore_obj_not_in_metadata(article)

            if self.keep_translations:
                restore_translation(article)

        return article
class addTranslatedArticleXmlCmd(addXmlCmd):
    """
    addTranslatedArticleXmlCmd: adds/removes translations.
    The original article is not changed.
    The current translations (for self.lang) are first removed, then the
    translation found in the XML is added back, together with optional
    HTML/PDF datastreams, and the translated PDF is (re)generated.

    Attributes:
        lang: language code of the translation to add/replace
        html_file_name: location of the translated HTML file (optional)
        pdf_file_name: location of the translated PDF file (optional)
        date_published_str: ISO-8601 publication date; when set, the PDF is
            assumed to already exist and its compilation is skipped

    Raises:
        exceptions.ResourceDoesNotExist: if the article is not in the database
    """

    lang = ""
    html_file_name = ""
    pdf_file_name = ""
    date_published_str = ""

    def internal_do(self):
        super().internal_do()

        xarticle = jats_parser.JatsArticle(tree=self.tree)
        article = model_helpers.get_article(xarticle.pid)

        if article is None:
            # Fix: use the local `xarticle`; `self.xarticle` is never set on
            # this command, so the original raised AttributeError instead of
            # the intended ResourceDoesNotExist.
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        # Merge the existing article with the new translation:
        # keep all current translations except the one in self.lang
        data_article = model_data_converter.db_to_article_data(article)
        new_translations = [
            translation
            for translation in data_article.translations
            if translation.lang != self.lang
        ]

        for xtrans_article in xarticle.translations:
            if xtrans_article.lang == self.lang:
                # Upload/views has copied the HTML file on disk
                # Add a DataStream.
                # TODO: check if the datastream is not already present
                if self.html_file_name:
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "text/html"
                    data["location"] = self.html_file_name
                    xtrans_article.streams.append(data)

                if self.pdf_file_name:
                    # Create a pdf file
                    # pdf-translate needs the article/sub-article XML
                    # Simply add a datastream for now
                    # The new Article created in Django will be complete
                    # But generate the PDF file at the end
                    data = model_data.create_datastream()
                    data["rel"] = "full-text"
                    data["mimetype"] = "application/pdf"
                    data["location"] = self.pdf_file_name
                    xtrans_article.streams.append(data)

                if self.date_published_str:
                    xtrans_article.date_published_iso_8601_date_str = self.date_published_str

                new_translations.append(xtrans_article)

        data_article.translations = new_translations

        cmd = addArticleXmlCmd(
            {
                "xarticle": data_article,
                "use_body": False,
                "issue": article.my_container,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        cmd.set_collection(article.get_collection())
        article = cmd.do()

        # pdf-translate needs the article/sub-article XML
        xml = ptf_cmds.exportPtfCmd(
            {
                "pid": article.pid,
                "with_body": False,
                "with_djvu": False,
                "article_standalone": True,
                "collection_pid": settings.COLLECTION_PID,
            }
        ).do()

        tex.create_translated_pdf(
            article,
            xml,
            self.lang,
            os.path.join(self.from_folder, self.pdf_file_name),
            os.path.join(self.from_folder, self.html_file_name),
            # If the date_published is specified, we assume that the PDF already exists
            skip_compilation=self.date_published_str != "",
        )

        return article
class addPCJArticleXmlCmd(addXmlCmd):
    """
    addPCJArticleXmlCmd: imports a PCJ article.

    When html_file_name is set, a full-text HTML datastream is attached to
    the parsed article before it is added through addArticleXmlCmd.
    """

    html_file_name = ""

    def internal_do(self):
        super().internal_do()

        parsed_article = jats_parser.JatsArticle(tree=self.tree)

        if self.html_file_name:
            # Attach the HTML full text as a datastream
            stream = model_data.create_datastream()
            stream["rel"] = "full-text"
            stream["mimetype"] = "text/html"
            stream["location"] = self.html_file_name
            parsed_article.streams.append(stream)

        add_cmd = addArticleXmlCmd(
            {
                "xarticle": parsed_article,
                "use_body": False,
                "issue": self.issue,
                "standalone": True,
                "from_folder": self.from_folder,
            }
        )
        add_cmd.set_collection(self.collection)
        return add_cmd.do()
class addBookXmlCmd(addXmlCmd):
    """
    addBookXmlCmd: adds/remove a book

    Exception raised:
    - exceptions.ResourceExists during do if the book already exists
    - exceptions.ResourceDoesNotExist
        during undo if the Book does not exist
        during do if the serial/provider does not exist
        <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    - RuntimeError during undo if resources are still published
    """

    provider = None
    import_oai_mode = False
    journal = None
    xml_format = "xmldata_jats"
    xbook = None
    _collection = None

    def set_provider(self, provider):
        self.provider = provider

    def add_parts(self, xparts, pseq):
        # Book parts are numbered from 1 at each nesting level
        if xparts:
            for seq, xpart in enumerate(xparts, start=1):
                self.add_part(xpart, seq, pseq)

    def add_part(self, xpart, seq, pseq):
        """Add one book part (stored as an Article), then recurse on its sub-parts."""
        if xpart is None:
            return

        # An Article is used to store a book part in the database
        article = model_helpers.get_article(xpart.pid)

        if article is not None:
            raise exceptions.ResourceExists(f"BookPart {xpart.pid} already exists")

        params = {
            "xobj": xpart,
            "pid": xpart.pid,
            "seq": seq,
            "pseq": pseq,
            # "deployed": deployed,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addBookPartPtfCmd(params)
        cmd.set_container(self.book)
        cmd.add_collection(self._collection)
        article = cmd.do(self)

        self.add_objects_with_location(xpart.ext_links, article, "ExtLink")
        self.add_objects_with_location(xpart.streams, article, "DataStream")

        self.add_parts(xpart.parts, seq)

    def set_import_oai_mode(self):
        self.import_oai_mode = True

    def internal_do(self):
        super().internal_do()

        #######################################################################
        # Get xbook

        if self.import_oai_mode:
            xmldata = globals()[self.xml_format]
            xbook = xmldata.Book(self.tree)
            self.journal = model_helpers.get_collection("GDML_Books")
        else:
            if self.xbook:
                xbook = self.xbook
            else:
                xbook = jats_parser.BitsBook(tree=self.tree)
                self.warnings.extend(xbook.warnings)

        #######################################################################
        # Get existing book if any

        if not self.provider:
            provider = model_helpers.get_provider_by_name(xbook.provider)
            self.provider = provider

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        #######################################################################
        # Delete any existing book

        if book is not None:
            if self.import_oai_mode:
                publisher = book.my_publisher

                # Note: the existing collection is not removed even if it no longer has a resource
                # TODO: urls/commands to add/update/delete a collection

                # Removes the book
                cmd = ptf_cmds.addContainerPtfCmd()
                cmd.set_object_to_be_deleted(book)
                cmd.undo()

                if publisher and publisher.publishes.count() == 0:
                    self.remove_publisher(publisher)
            else:
                raise exceptions.ResourceExists("Book %s already exists" % book_id)

        #######################################################################
        # Add new book

        if xbook.incollection:
            colid = xbook.incollection[0].pid
            self._collection = model_helpers.get_collection(colid)
            if self._collection is None:
                raise exceptions.ResourceDoesNotExist(f"The collection {colid} does not exist")
        elif self.import_oai_mode:
            self._collection = self.journal

        params = {
            "xobj": xbook,
            "pid": xbook.pid,
            "from_folder": self.from_folder,
            "to_folder": self.to_folder,
            "solr_commit": False,
        }

        cmd = ptf_cmds.addContainerPtfCmd(params)
        cmd.add_collection(self._collection)
        # Fix: use self.provider; the local `provider` is only bound when
        # self.provider was not already set via set_provider(), so the
        # original raised NameError in that case.
        cmd.set_provider(self.provider)

        book = cmd.do(self)
        self.book = book

        self.add_objects_with_location(xbook.ext_links, book, "ExtLink")
        self.add_objects_with_location(xbook.related_objects, book, "RelatedObject")
        self.add_objects_with_location(xbook.streams, book, "DataStream")

        # self.add_metadata_parts(xbook, book) TODO support Metadataparts ?

        #######################################################################
        # Add Book parts

        # JatsIssue is an iterator (has the __iter__ function)
        # TODO make JatsBook an iterator as well ?
        self.add_parts(xbook.parts, 0)

        # Update the collection first year and last year
        for incol in xbook.incollection:
            self.update_collection_years(incol.pid, book)

        return book
1190######################################################################################
1191######################################################################################
1192#
1193# Update Commands
1194#
1195######################################################################################
1196######################################################################################
class updateCollectionsXmlCmd(addXmlCmd):
    """
    updateSerialsXmlCmd: updates one or more journals

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection does not exist
    - RuntimeError if undo is called
    """

    def _parse_collection_node(self, node):
        """Map an XML node to its metadata parser; return None for unknown tags."""
        if node.tag == "collection-meta":
            return jats_parser.BitsCollection(tree=node)
        if node.tag == "journal-meta":
            return jats_parser.JatsJournal(tree=node)
        if node.tag == "publication-meta":
            return jats_parser.MathdocPublication(tree=node)
        return None

    def update_collection(self, xcol, do_update=True):
        """Update one collection from its parsed metadata.

        With do_update=False, only checks that the collection exists.
        Raises exceptions.ResourceDoesNotExist if it does not.
        """
        if not xcol:
            return None

        provider = model_helpers.get_provider_by_name(xcol.provider)

        col_id = xcol.pid
        col = model_helpers.get_collection(col_id)

        if col is None:
            raise exceptions.ResourceDoesNotExist("Collection %s does not exist" % xcol.pid)

        if do_update:
            params = {
                "xobj": xcol,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }

            # The existing other_ids, abstracts are removed in updateCollectionDatabaseCmd::internal_do
            # and the new ones are added in the post_do (addResourceDatabaseCmd)
            cmd = ptf_cmds.updateCollectionPtfCmd(params)
            cmd.set_provider(provider)
            # cmd.set_publisher(publisher)
            col = cmd.do()

            # The existing extlinks are removed in updateCollectionDatabaseCmd::internal_do
            self.add_objects_with_location(xcol.ext_links, col, "ExtLink")
            resolver.copy_binary_files(col, self.from_folder, self.to_folder)

            # if publisher:
            #     model_helpers.publish_resource(publisher, col)

        return col

    def internal_do(self):
        super().internal_do()

        collections = []

        # First pass: check that all the collections exist (no update yet)
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            self.update_collection(xcol, False)

        # Second pass: perform the updates.
        # Guard against unknown tags: the original code dereferenced
        # xcol.warnings even when no parser matched (AttributeError on None).
        for node in self.tree:
            xcol = self._parse_collection_node(node)
            if xcol is None:
                continue
            self.warnings.extend(xcol.warnings)
            collections.append(self.update_collection(xcol))

        return collections

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1279#####################################################################
1280#
1281# replaceIssueXmlCmd: updates an issue
1282#
1283# Exception raised:
1284# - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
1285# <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
1286# - RuntimeError if undo is called
1287#
1288######################################################################
class replaceIssueXmlCmd(addXmlCmd):
    """
    replaceIssueXmlCmd: deletes an existing issue and re-imports it from the XML.

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
    - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        xissue = jats_parser.JatsIssue(tree=self.tree)
        self.warnings.extend(xissue.warnings)

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        issue_id = xissue.pid
        issue = model_helpers.get_container(issue_id)

        if issue is None:
            raise exceptions.ResourceDoesNotExist("Issue %s does not exist" % issue_id)

        publisher = issue.my_publisher

        # Remove the existing issue before re-importing it
        cmd = ptf_cmds.addContainerPtfCmd()
        cmd.set_object_to_be_deleted(issue)
        cmd.undo()

        # Fix: guard against issues without a publisher, consistent with
        # addBookXmlCmd (the original dereferenced publisher unconditionally).
        if publisher and publisher.publishes.count() == 0:
            self.remove_publisher(publisher)

        # update the journal first and last year
        for the_issue in journal.content.all():
            self.update_collection_years(journal_id, the_issue, False)

        journal.save()

        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                "solr_commit": False,
                "extra_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        )
        issue = cmd.do()

        return issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class updateBookXmlCmd(addXmlCmd):
    """
    updateBookXmlCmd: updates a book

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Book does not exist
    - RuntimeError if undo is called
    """

    def internal_do(self):
        super().internal_do()

        parsed_book = jats_parser.BitsBook(tree=self.tree)
        self.warnings.extend(parsed_book.warnings)

        existing_book = model_helpers.get_container(parsed_book.pid)
        if existing_book is None:
            raise exceptions.ResourceDoesNotExist("Book %s does not exist" % parsed_book.pid)

        # unpublish and delete the existing publisher if necessary
        # self.update_publisher(xbook, book)

        # Note: the existing collection is not removed even if it no longer has a resource
        # TODO: urls/commands to add/update/delete a collection

        # Remove the current book before re-importing it
        delete_cmd = ptf_cmds.addContainerPtfCmd()
        delete_cmd.set_object_to_be_deleted(existing_book)
        delete_cmd.undo()

        # Re-import the book from the parsed XML
        return addBookXmlCmd(
            {
                "xbook": parsed_book,
                "use_body": False,
                "solr_commit": False,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
            }
        ).do()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateContainerXmlCmd(addXmlCmd):
    """
    addOrUpdateContainerXmlCmd: detects Container type from xml and adds or updates an issue or a book

    just detect Container type (do not check params etc.)
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    full_text_folder = ""
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

    def internal_do(self):
        super().internal_do()

        # Dispatch on the root tag, then run the sub-command once
        tag = normalize(self.tree.tag)

        if tag == "journal-issue":
            sub_cmd = addOrUpdateIssueXmlCmd(
                {
                    "body": self.body,
                    "keep_metadata": self.keep_metadata,
                    "keep_translations": self.keep_translations,
                    "backup_folder": self.backup_folder,
                    "to_folder": self.to_folder,
                    "from_folder": self.from_folder,
                    "xml_file_folder": self.xml_file_folder,
                    "fake": self.fake,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        elif tag == "book":
            sub_cmd = addOrUpdateBookXmlCmd(
                {
                    "body": self.body,
                    "from_folder": self.from_folder,
                    "to_folder": self.to_folder,
                    "no_bib": self.no_bib,
                    "embargo": self.embargo,
                }
            )
        else:
            raise RuntimeError("addOrupdateContainer command can't detect container type")

        obj = sub_cmd.do()
        self.warnings.extend(sub_cmd.warnings)
        return obj

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateIssueXmlCmd(addXmlCmd):
    """
    addOrUpdateIssueXmlCmd: adds or updates an issue

    Adds an issue if it is not in the system or updates the issue if it is already there.
    By default, no DOI is assigned for the articles. Set assign_doi to True.

    from_folder / to_folder (declared in addXmlCmd): location of binary files to copy
    backup_folder: folder where extra data (extid false_positive...) are (to be) stored in a json

    keep_metadata:
        True if you want to back up extra data (icon, dates, matching ids, ...) in the backup_folder
        Default: False
        Note: backup_obj_not_in_metadata / restore_obj_not_in_metadata is always called
        We always want to preserve GraphicalAbstracts (they are not in the issue XML)

    keep_translations:
        True if you want back up/restore translations.
        Default: False
        Note: When you post an article to a journal (test) website, the translation is declared in the XML
        But if you import a Cedrics article in Trammel, the XML does not list translations

    Exception raised:
    - exceptions.ResourceDoesNotExist during do if the Collection/Issue/Provider does not exist
      <custom-meta-group><custom-meta><meta-name>provider</meta-name><meta-value>
    - RuntimeError if undo is called
    """

    keep_metadata = False
    keep_translations = False
    backup_folder = None
    assign_doi = False
    full_text_folder = ""

    # Pre-parsed issue (JatsIssue); when set, self.body/self.tree is not parsed again
    xissue = None
    fake = False  # Parse the XML but do not import
    no_bib = False  # Ignore the references during the import (used in Geodesic)
    embargo = False  # Import only the open articles (used in Geodesic)

    def check_params(self):
        super().check_params()

        # The two options are mutually exclusive: restoring backed-up metadata
        # would clash with freshly assigned DOIs.
        if self.keep_metadata and self.assign_doi:
            raise ValueError("keep_metadata and assign_doi cannot both be true.")

        if self.keep_metadata and self.backup_folder is None:
            raise ValueError("backup_folder needs to be set when keep_metadata is true.")

    def internal_do(self):
        super().internal_do()

        # Parse the issue XML unless a pre-parsed xissue was supplied
        if not self.xissue:
            self.xissue = xissue = jats_parser.JatsIssue(
                tree=self.tree, from_folder=self.from_folder, no_bib=self.no_bib
            )
            if len(xissue.warnings) > 0 and self.xml_file_folder:
                # Print each distinct warning (de-duplicated by value)
                warnings = []
                warning_keys = []
                for warning in xissue.warnings:
                    for key, value in warning.items():
                        if value not in warning_keys:
                            warning_keys.append(value)
                            warnings.append({key: value})
                for warning in warnings:
                    print(warning)
            self.warnings.extend(xissue.warnings)
        else:
            xissue = self.xissue

        # fake mode: parse only, do not touch the database
        if self.fake:
            return

        xjournal = xissue.journal
        journal_id = xjournal.pid
        journal = model_helpers.get_collection(journal_id)

        if journal is None:
            raise exceptions.ResourceDoesNotExist("Journal %s does not exist" % xjournal.pid)

        existing_issue = model_helpers.get_container(xissue.pid)

        if existing_issue:
            if self.embargo and existing_issue.embargo():
                # Geodesic is for open access articles.
                # We do not want to import the issues under embargo
                print(f"Embargo, ignore {xissue.pid}")
                return None

            if self.keep_metadata:
                # Start with a backup of the existing issue, in case of a bug.
                ptf_cmds.exportPtfCmd(
                    {
                        "pid": existing_issue.pid,
                        "with_internal_data": True,
                        "with_binary_files": False,
                        "for_archive": False,
                        "export_folder": os.path.join(settings.MERSENNE_TMP_FOLDER, "backup"),
                    }
                ).do()

                # Save the extra data (extid, deployed_date, ...) in a json
                # file that will be re-imported along with the new issue
                params = {
                    "pid": existing_issue.pid,
                    "export_folder": self.backup_folder,
                    "export_all": True,
                    "with_binary_files": True,
                }
                ptf_cmds.exportExtraDataPtfCmd(params).do()

            # Always back up objects that are not part of the XML metadata
            # (e.g. GraphicalAbstracts) — see the class docstring.
            for article in existing_issue.article_set.all():
                backup_obj_not_in_metadata(article)
                if self.keep_translations:
                    backup_translation(article)

            # Delete the existing issue, otherwise the import would complain
            # about already existing articles
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(existing_issue)
            cmd.undo()

            # update the journal first and last year
            for the_issue in journal.content.all():
                self.update_collection_years(journal_id, the_issue, False)

            journal.save()
        else:
            issue_to_appear = model_helpers.get_issue_to_appear(journal_id)

            # For AIF, the articles of the "to appear" volume are moved into a
            # new volume before publication (from AIF_0__0_ to AIF_2018... for example).
            # The first time, AIF_2018_ is not yet in PTF and existing_issue is None.
            # Example: AIF_0_0 contains doi1, doi2 and doi3, AIF_2018 contains doi1 and doi2.
            # The import would fail because the same article cannot exist twice.
            # Deleting AIF_0_0 is not an option because doi3 would be lost.
            # The articles common to _0__0 and 2018_ have to be deleted before
            # importing the new volume, otherwise there would be conflicts.

            if issue_to_appear and xissue.pid != issue_to_appear.pid:
                # Save the extra data (extid, deployed_date, ...) in a json
                # file that will be re-imported with the new issue, as well as
                # the image associated via ptf-tools
                if self.keep_metadata:
                    params = {
                        "pid": issue_to_appear.pid,
                        "force_pid": xissue.pid,
                        "export_folder": self.backup_folder,
                        "export_all": True,
                        "with_binary_files": True,
                    }
                    ptf_cmds.exportExtraDataPtfCmd(params).do()

                # Remove from the "to appear" issue the articles that are also
                # in the new issue (matched by DOI), backing them up first
                for xarticle in xissue:
                    xdoi = getattr(xarticle, "doi")
                    article = issue_to_appear.article_set.filter(doi=xdoi).first()
                    if article:
                        backup_obj_not_in_metadata(article)
                        if self.keep_translations:
                            backup_translation(article)

                        params = {"to_folder": self.to_folder}  # for the deletion of the binary files
                        cmd = ptf_cmds.addArticlePtfCmd(params)
                        cmd.set_object_to_be_deleted(article)
                        cmd.undo()

        # If backup_folder is not None, addIssueXmlCmd.post_do() uses importExtraDataPtfCmd
        cmd = addIssueXmlCmd(
            {
                "xissue": xissue,
                "use_body": False,
                # "body": self.body,
                "assign_doi": self.assign_doi,
                "full_text_folder": self.full_text_folder,  # Cedrics: the full text for SolR is in a separate file
                "extra_folder": self.backup_folder,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "no_bib": self.no_bib,
                "embargo": self.embargo,
                "solr_commit": False,
            }
        )
        new_issue = cmd.do()

        if new_issue:
            new_articles = new_issue.article_set.all()

            # With the assign_doi option, check that the DOIs were assigned
            for article in new_articles:
                if self.assign_doi and article.doi is None:
                    raise exceptions.ResourceHasNoDoi("The article %s has no DOI" % article.pid)

                # TODO garbage collector on articles no longer in the issue
                restore_obj_not_in_metadata(article)
                if self.keep_translations:
                    restore_translation(article)

        return new_issue

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
class addOrUpdateBookXmlCmd(addXmlCmd):
    """Delete the existing book (if any), then (re-)import it from the XML."""

    # Pre-parsed book (BitsBook); when set, self.body/self.tree is not parsed again
    xbook = None

    def internal_do(self):
        super().internal_do()

        if not self.xbook:
            xbook = jats_parser.BitsBook(tree=self.tree)
            self.warnings.extend(xbook.warnings)
        else:
            xbook = self.xbook

        book_id = xbook.pid
        book = model_helpers.get_container(book_id)

        if book:
            # Remove the existing book before re-importing it
            cmd = ptf_cmds.addContainerPtfCmd()
            cmd.set_object_to_be_deleted(book)
            cmd.undo()

            collection = book.get_collection()

            # update the collection first and last year
            for container in collection.content.all():
                self.update_collection_years(collection.pid, container, False)

            collection.save()

        cmd = addBookXmlCmd(
            {
                "xbook": xbook,
                "use_body": False,
                # "body": self.body,
                "from_folder": self.from_folder,
                "to_folder": self.to_folder,
                "solr_commit": False,
            }
        )
        book = cmd.do()
        return book
class updateBibitemCitationXmlCmd(baseCmd):
    """Regenerate the XML/HTML/TeX citations of a bibitem from its external ids."""

    def __init__(self, params=None):
        self.bibitem = None

        super().__init__(params)

        self.required_params.extend(["bibitem"])

    def set_bibitem(self, bibitem):
        self.bibitem = bibitem

    def internal_do(self):
        super().internal_do()

        # Collect the current external ids of the bibitem, keyed by id_type
        new_ids = {
            item.id_type: {
                "id_type": item.id_type,
                "id_value": item.id_value,
                "checked": item.checked,
                "false_positive": item.false_positive,
            }
            for item in self.bibitem.bibitemid_set.all()
        }

        regenerated = jats_parser.update_bibitem_xml(self.bibitem, new_ids)
        self.warnings.extend(regenerated.warnings)

        # Store the regenerated citation representations
        self.bibitem.citation_xml = regenerated.citation_xml
        self.bibitem.citation_html = regenerated.citation_html
        self.bibitem.citation_tex = regenerated.citation_tex
        self.bibitem.save()

    def internal_undo(self):
        raise RuntimeError("update commands do not support the undo")
1735######################################################################################
1736######################################################################################
1737#
1738# Import Commands
1739#
1740######################################################################################
1741######################################################################################
class collectEntireCollectionXmlCmd(baseCmd):
    """
    Get the PIDs of all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        self.pid = None
        self.folder = None

        super().__init__(params)

        self.required_params.extend(["pid", "folder"])

    def internal_do(self):
        super().internal_do()
        # Only the pid of each (pid, file) pair is of interest here
        return [
            item_pid
            for item_pid, _ in resolver.iterate_collection_folder(self.folder, self.pid)
        ]
class importEntireCollectionXmlCmd(baseCmd):
    """
    Import all the XML of a collection (collection.xml, issues.xml) of a given folder

    results:
    """

    def __init__(self, params=None):
        # Default options; `params` may override them before the required
        # parameters are checked.
        self.pid = None
        self.from_folder = None
        self.to_folder = None
        self.backup_folder = None
        self.keep_metadata = False
        self.keep_translations = False

        self.with_cedrics = True
        self.from_cedrics = False  # The entire collection is in Cedrics format
        self.date_for_pii = False  # Fetch publication_date for Elsevier articles
        self.first_issue = ""
        self.fake = False  # Parse the XML but do not import

        self.no_bib = False  # Ignore the references during the import (used in Geodesic)
        self.embargo = False  # Import only the open articles (used in Geodesic)

        # Optional progress reporting: callback(job, i) is invoked after each issue
        self.caller = None
        self.callback = None
        self.job = None

        super().__init__(params)

        self.required_params.extend(["pid", "from_folder"])

    def internal_do(self):
        """Create the collection if needed, then import every issue of the folder."""
        super().internal_do()

        pid = self.pid
        resource = model_helpers.get_resource(pid)
        if not resource and not self.fake:
            # The collection is not in the database yet: import collection.xml first
            body = resolver.get_archive_body(self.from_folder, pid, None)
            journals = addCollectionsXmlCmd(
                {"body": body, "from_folder": self.from_folder, "to_folder": self.to_folder}
            ).do()
            if not journals:
                raise ValueError(self.from_folder + " does not contain a collection")
            resource = journals[0]

        obj = resource.cast()

        if obj.classname != "Collection":
            raise ValueError(pid + " does not contain a collection")

        if self.with_cedrics:
            # with_cedrics means that you want to import everything from scratch
            # Delete solr documents (01/28/2020: Solr can have multiple docs with the same PID)
            cmd = solr_cmds.solrDeleteCmd({"q": "pid:" + self.pid + "*"})
            cmd.do()

        # Import each issue found in the folder, optionally starting at first_issue
        i = 0
        for pid, file_ in resolver.iterate_collection_folder(
            self.from_folder, self.pid, self.first_issue
        ):
            if self.callback is None:
                print(pid)

            if self.from_cedrics:
                cmd = importCedricsIssueDirectlyXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": file_,
                        "remove_email": False,
                        "remove_date_prod": True,
                        "copy_files": True,
                        "force_dois": False,
                    }
                )
            else:
                body = resolver.get_body(file_)
                xml_file_folder = os.path.dirname(file_)
                cmd = addOrUpdateContainerXmlCmd(
                    {
                        "body": body,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                        "backup_folder": self.backup_folder,  # Read extra data (if any) stored in a json file
                        "xml_file_folder": xml_file_folder,  # when article.XML are in separate files
                        "keep_metadata": self.keep_metadata,  # Backup/Restore existing data not in the XML
                        "keep_translations": self.keep_translations,  # Backup/Restore existing translations
                        "no_bib": self.no_bib,
                        "embargo": self.embargo,
                        # Needed in Trammel
                        "fake": self.fake,
                    }
                )
            cmd.do()

            i += 1
            if self.callback:
                self.callback(self.job, i)

        if self.with_cedrics:
            # Also import the Cedrics metadata XML files of the collection
            src_folder = os.path.join(settings.CEDRAM_XML_FOLDER, self.pid, "metadata")

            xml_files = [
                os.path.join(src_folder, f)
                for f in os.listdir(src_folder)
                if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".xml")
            ]
            for xml_file in xml_files:
                if self.callback is None:
                    print(xml_file)

                cmd = importCedricsIssueXmlCmd(
                    {
                        "colid": self.pid,
                        "input_file": xml_file,
                        "from_folder": self.from_folder,
                        "to_folder": self.to_folder,
                    }
                )
                cmd.do()
class importCedricsIssueXmlCmd(baseCmd):
    """
    Import a Cedrics issue into the database (legacy workflow based on
    /cedram_dev/exploitation/cedram and the cedram2ptf.py XSL conversion).

    Params:
        colid (required): pid of the collection
        input_file: path of the Cedrics issue XML file
        remove_email: remove emails during the Cedrics -> JATS conversion
        remove_date_prod: ignore the production date during the conversion
        diff_only: only compare the XML with the database content, do not import
        xissue: pre-parsed issue; when absent it is built from input_file
        copy_files: copy binary files to the test data folder
    """

    def __init__(self, params=None):
        self.colid = None
        self.input_file = None
        self.remove_email = True
        self.remove_date_prod = True
        self.diff_only = False
        self.body = None
        self.xissue = None
        self.copy_files = True

        super().__init__(params)

        self.required_params.extend(["colid"])

    def import_full_text(self, issue):
        """
        Some journals want to display the full text in HTML (CRCHIM/CRGEOS/CEBIOL)
        Read the XML file and convert the body in HTML
        """
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, issue.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(self.colid, issue.pid)

        if len(tex_folders) > 0:
            # NOTE(review): assumes issue.article_set and tex_folders are in the
            # same order and of the same length — an extra article would raise
            # an IndexError. TODO confirm against resolver.get_cedram_tex_folders.
            i = 0
            for article in issue.article_set.all():
                article_folder = tex_folders[i]
                xml_file = os.path.join(
                    tex_src_folder, article_folder, "FullText", article_folder + ".xml"
                )

                # Record the article folder name as the article's "ojs-id"
                cmd = ptf_cmds.updateResourceIdPtfCmd(
                    {"id_type": "ojs-id", "id_value": article_folder}
                )
                cmd.set_resource(article)
                cmd.do()

                if os.path.isfile(xml_file):
                    with open(xml_file, encoding="utf-8") as f:
                        body = f.read()

                    cmd = addBodyInHtmlXmlCmd(
                        {
                            "body": body,
                            "from_folder": settings.CEDRAM_XML_FOLDER,
                            # needed to copy binary files such as images
                            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                        }
                    )
                    cmd.set_article(article)
                    cmd.do()

                i += 1

    def import_in_db(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        """

        # Cedrics: the full text for SolR is in a separate file
        full_text_folder = os.path.dirname(os.path.dirname(self.input_file)) + "/plaintext/"

        params = {
            "assign_doi": False,
            "full_text_folder": full_text_folder,
            "keep_metadata": True,
            "keep_translations": True,
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,
            "from_folder": settings.CEDRAM_XML_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue

    def compare_issue(self):
        """
        Compare the parsed issue (self.xissue) with the issue stored in the database.

        Returns (result, issues_diff, xissue):
            result: False when the database and the XML differ
            issues_diff: dict filled by the comparator with the differences
        """
        xissue = self.xissue
        issues_diff = {}
        result = True

        time1 = timezone.now()

        new_dois = [article.doi for article in xissue.articles]

        # Prefetch everything the comparator will walk, to avoid N+1 queries
        article_qs = Article.objects.filter(doi__in=new_dois).prefetch_related(
            "abstract_set",
            "kwd_set",
            "subj_set",
            "datastream_set",
            "relatedobject_set",
            "resourcecount_set",
            "contributions",
            "contributions__contribaddress_set",
            "bibitem_set__bibitemid_set",
            "bibitem_set__contributions",
            "bibitem_set__contributions__contribaddress_set",
        )

        issue = None
        try:
            issue = (
                Container.objects.select_related("my_collection", "my_publisher")
                .prefetch_related(
                    Prefetch("article_set", queryset=article_qs, to_attr="articles_from_doi")
                )
                .get(sites__id=settings.SITE_ID, pid=xissue.pid)
            )
        except Container.DoesNotExist:
            pass

        if issue:
            data_issue = model_data_converter.db_to_issue_data(issue, issue.articles_from_doi)

            # Debug timing of the database fetch/conversion
            delta = timezone.now() - time1
            print(delta)

            # Handle xml cmds side effects (ex: "numdam" changed into "mathdoc", ...)
            model_data_comparator.prepare_issue_for_comparison(xissue)

            issue_comparator = model_data_comparator.IssueDataComparator()

            result = issue_comparator.compare(data_issue, xissue, issues_diff)

        return (result, issues_diff, xissue)

    def delete_previous_file(self, output_folder):
        """
        Remove <output_folder>/<colid>/<basename of input_file> if present and
        make sure its parent folders exist. Returns the output file path.
        """
        basename = os.path.basename(self.input_file)

        output_file = os.path.join(output_folder, self.colid, basename)
        if os.path.isfile(output_file):
            os.remove(output_file)

        os.makedirs(output_folder, exist_ok=True)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        return output_file

    def import_cedrics_issue(self):
        """
        Import Cedrics issue from /cedram_dev/exploitation/cedram
        This workflow is no longer used.
        Cedrics issues are imported from /cedram_dev/production_tex/CEDRAM
        (see importCedricsIssueDirectlyXmlCmd below)
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        ptf_xsl_folder = settings.PTF_XSL_FOLDER
        log_file = os.path.join(output_folder, settings.MERSENNE_LOG_FILE)

        # 1. Delete the previous file
        output_file = self.delete_previous_file(output_folder)

        # 2. Transform the cedrics XML into JATS
        cmd_folder = os.path.join(ptf_xsl_folder, "cedram")

        # SECURITY NOTE: shell=True with interpolated paths. The values come from
        # settings and from trusted folders; do not reuse with untrusted input.
        cmd_str = 'cd {}; {} cedram2ptf.py -v -x {} -p {} -o {} -b "" -l {} {} {} > {} 2>&1'.format(
            cmd_folder,
            os.path.join(settings.VIRTUALENV_DIR, "bin/python"),
            "-s" if self.colid in settings.MERSENNE_SEMINARS else "",
            self.input_file,
            output_folder,
            log_file + "1",
            # option -e for cedram2ptf.py for not removing email
            "-e" if not self.remove_email else "",
            "-t" if self.remove_date_prod else "",
            log_file,
        )

        log_file2 = log_file + "2"
        with open(log_file2, "w", encoding="ascii") as file_:
            file_.write(cmd_str + "\n")

            # Guard against appending the same path on every import
            lib_folder = ptf_xsl_folder + "/lib"
            if lib_folder not in sys.path:
                sys.path.append(lib_folder)

            try:
                result = subprocess.check_output(cmd_str, shell=True)
            except Exception as e:
                with open(log_file) as logfile_:
                    logfile_body = logfile_.read()
                message = str(e) + "\n" + logfile_body + "\n"
                file_.write(message)
                # the with statement closes file_ when the exception propagates
                raise RuntimeError(message)

            file_.write(str(result) + "\n")

        # Check if the output_file has been created
        if not os.path.isfile(output_file):
            raise RuntimeError("The file was not converted in JATS")

        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=True, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = jats_parser.JatsIssue(tree=tree)
        self.warnings.extend(self.xissue.warnings)

    def internal_do(self):
        super().internal_do()

        # Build the issue from the XML file unless it was passed in directly
        if not self.xissue:
            self.import_cedrics_issue()

        result = None

        if self.diff_only:
            result = self.compare_issue()
        else:
            result = self.import_in_db()

        return result
2121# import from /cedram_dev/production_tex/CEDRAM
class importCedricsIssueDirectlyXmlCmd(importCedricsIssueXmlCmd):
    """
    Import a Cedrics issue directly from /cedram_dev/production_tex/CEDRAM:
    the Cedrics XML is parsed as-is, without the Cedrics -> JATS transformation.

    Extra params (on top of importCedricsIssueXmlCmd):
        is_seminar: the collection is a seminar (different Cedrics parsing)
        force_dois: raise ValueError when a parsed article has no DOI
    """

    def __init__(self, params=None):
        self.is_seminar = False
        self.article_folders = None
        self.dois = None  # filled by import_cedrics_issue, together with article_folders
        self.force_dois = True
        super().__init__(params)

    def read_file(self, filename, skip_lines=2):
        """
        Return the lines of filename, skipping the first skip_lines + 1 lines
        (presumably the XML declaration/DOCTYPE/root lines of a -cdrxml file,
        re-written by import_cedrics_issue — TODO confirm).
        Tries utf-8 first, then falls back to iso-8859-1.
        """

        def read_lines(encoding):
            # lines with index 0..skip_lines are dropped
            with open(filename, encoding=encoding) as fr:
                return [line for i, line in enumerate(fr) if i > skip_lines]

        try:
            return read_lines("utf-8")
        except UnicodeDecodeError:
            return read_lines("iso-8859-1")

    def import_cedrics_issue(self):
        """
        Parse the Cedrics XML directly, without Cedrics -> JATS transformation
        The deplace_fasc script is no longer needed, but the Cedrics issue XML has to be created
        Workflow
        1. Get the list of articles from /cedram_dev/production_tex/CEDRAM
        2. Cat the article XML files into one issue.XML
        3. Read the Cedrics issue.XML

        :return:
        """

        output_folder = settings.MERSENNE_TMP_FOLDER
        output_file = self.delete_previous_file(output_folder)

        basename = os.path.basename(self.input_file)
        if "-cdrxml" in basename:
            pid = basename.split("-cdrxml.")[0]
        else:
            pid = basename.split(".xml")[0]

        # 1. Get the list of articles
        tex_src_folder = resolver.get_cedram_issue_tex_folder(self.colid, pid)
        self.article_folders, self.dois = resolver.get_cedram_tex_folders(self.colid, pid)

        # 2. Create the issue XML file
        with open(output_file, "w", encoding="utf-8") as fw:
            # 2.a. Start the issue.xml based on @pid-cdrxml.xml
            fw.write('<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
            fw.write('<!DOCTYPE cedram SYSTEM "/home/cedram/XML/dtd/cedram.dtd">\n')
            fw.write("<cedram>\n")

            fw.writelines(self.read_file(self.input_file))

            # 2.b. Cat the article XML files
            for folder in self.article_folders:
                src_file = os.path.join(tex_src_folder, folder, folder + "-cdrxml.xml")
                fw.writelines(self.read_file(src_file))

            fw.write("</cedram>\n")

        # 3. Read the Cedrics issue.XML
        with open(output_file, encoding="utf-8") as f:
            self.body = f.read()

        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(self.body.encode("utf-8"), parser=parser)
        self.xissue = cedrics_parser.CedricsIssue(
            tree=tree,
            is_seminar=self.is_seminar,
            ignore_date_published=self.remove_date_prod,
            article_folders=self.article_folders,
            dois=self.dois,
        )
        if self.force_dois:
            for xarticle in self.xissue.articles:
                if xarticle.doi is None:
                    raise ValueError(xarticle.pid, "n'a pas de doi")

        self.warnings.extend(self.xissue.warnings)

    def import_in_db(self):
        """
        Add or update the issue in Django/SolR via addOrUpdateIssueXmlCmd,
        then import the article full texts in HTML.
        """
        params = {
            "assign_doi": False,
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            "keep_metadata": True,
            "keep_translations": True,  # The cedrics XML does not have the translations. backup/restore them.
            "use_body": False,
            "xissue": self.xissue,
            "backup_folder": settings.MERSENNE_TMP_FOLDER,  # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER if self.copy_files else None,
        }

        cmd = addOrUpdateIssueXmlCmd(params)
        issue = cmd.do()
        self.warnings.extend(cmd.get_warnings())

        self.import_full_text(issue)

        return issue
class addCedricsIssueXmlCmd(addXmlCmd):
    """
    Parse a Cedrics issue XML into a CedricsIssue object.

    The XML body is handled by the addXmlCmd base class (self.tree is
    presumably built from the body param — TODO confirm in addXmlCmd);
    internal_do only runs the cedrics parser on it.
    """

    # Class-level defaults; baseCmd params may override them.
    assign_doi = False
    full_text_folder = ""
    import_folder = None
    prod_deployed_date_iso_8601_date_str = None
    xissue = None
    remove_blank_text = False
    is_seminar = False  # seminars are parsed differently by cedrics_parser

    def internal_do(self):
        super().internal_do()

        # Parse the issue; the result is both stored and returned.
        self.xissue = cedrics_parser.CedricsIssue(tree=self.tree, is_seminar=self.is_seminar)

        return self.xissue
class addorUpdateCedricsArticleXmlCmd(baseCmd):
    """
    Add or update one Cedrics article inside an existing issue.

    Params:
        container_pid (required): pid of the existing issue
        article_folder_name (required): folder of the article under
            settings.CEDRAM_TEX_FOLDER/<colid>/<container_pid>/

    Raises:
        exceptions.ResourceDoesNotExist: the issue is not in the database
        ValueError: the parsed article has no DOI
    """

    def __init__(self, params=None):
        self.container_pid = None
        self.article_folder_name = None

        super().__init__(params)

        self.required_params.extend(["container_pid", "article_folder_name"])

    def internal_do(self):
        super().internal_do()

        issue = model_helpers.get_container(self.container_pid)
        if not issue:
            raise exceptions.ResourceDoesNotExist(f"Issue {self.container_pid} does not exist")

        colid = issue.my_collection.pid
        article_folder = os.path.join(
            settings.CEDRAM_TEX_FOLDER, colid, self.container_pid, self.article_folder_name
        )

        # 1. Read the Cedrics article.XML
        input_file = os.path.join(article_folder, f"{self.article_folder_name}-cdrxml.xml")
        with open(input_file, encoding="utf-8") as f:
            body = f.read()

        # 2. Parse the file and create an xarticle
        is_seminar = colid in settings.MERSENNE_SEMINARS
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        tree = etree.fromstring(body.encode("utf-8"), parser=parser)
        xarticle = cedrics_parser.CedricsArticle(
            tree=tree,
            colid=colid,
            issue_id=self.container_pid,
            is_seminar=is_seminar,
            ignore_date_published=True,
            article_folder=self.article_folder_name,
        )
        if xarticle.doi is None:
            raise ValueError(xarticle.pid, "n'a pas de doi")

        # Get the article position in its issue (seq) to preserve its order
        article_folders, _dois = resolver.get_cedram_tex_folders(colid, self.container_pid)
        for seq, folder in enumerate(article_folders, start=1):
            if folder == self.article_folder_name:
                xarticle.seq = seq

        existing_article = model_helpers.get_article(xarticle.pid)
        temp_folder = settings.MERSENNE_TMP_FOLDER

        # 3. Backup/removal of the existing article
        if existing_article:
            # Start with a backup of the existing issue, in case of a bug.
            ptf_cmds.exportPtfCmd(
                {
                    "pid": self.container_pid,
                    "with_internal_data": True,
                    "with_binary_files": False,
                    "for_archive": False,
                    "export_folder": os.path.join(temp_folder, "backup"),
                }
            ).do()

            # Save the extra data (extid, deployed_date, ...) in a json file
            params = {
                "pid": existing_article.pid,
                "export_folder": temp_folder,
                "export_all": True,
                "with_binary_files": True,
            }
            ptf_cmds.exportExtraDataPtfCmd(params).do()

            backup_obj_not_in_metadata(existing_article)
            backup_translation(existing_article)

        # No need to delete the existing article: addArticleXmlCmd does it in standalone mode

        # 4. Add the article in Django/SolR
        params = {
            "xarticle": xarticle,
            "issue": issue,
            "standalone": True,
            "use_body": False,  # No self.body with the content of the XML file; xarticle is passed directly
            "full_text_folder": settings.CEDRAM_TEX_FOLDER,  # the full text for SolR is in a separate file
            # temp folder used to backup/restore info during the import
            "from_folder": settings.CEDRAM_TEX_FOLDER,
            "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,
            "keep_translations": True,
        }

        cmd = addArticleXmlCmd(params)
        cmd.set_collection(issue.my_collection)
        article = cmd.do()

        # 5. Read the full text in HTML
        xml_file = os.path.join(article_folder, "FullText", self.article_folder_name + ".xml")
        if os.path.isfile(xml_file):
            with open(xml_file, encoding="utf-8") as f:
                body = f.read()

            cmd = addBodyInHtmlXmlCmd(
                {
                    "body": body,
                    "from_folder": settings.CEDRAM_XML_FOLDER,
                    # needed to copy binary files such as images
                    "to_folder": settings.MERSENNE_TEST_DATA_FOLDER,  # idem
                    "remove_blank_text": False,
                }
            )
            cmd.set_article(article)
            cmd.do()

        # 6. Add the ojs-id for ptf-tools
        cmd = ptf_cmds.updateResourceIdPtfCmd(
            {"id_type": "ojs-id", "id_value": self.article_folder_name}
        )
        cmd.set_resource(article)
        cmd.do()

        # 7. Restore the extra data (extid, deployed_date, ...)
        if existing_article:
            ptf_cmds.importExtraDataPtfCmd(
                {"pid": existing_article.pid, "import_folder": temp_folder}
            ).do()

            restore_obj_not_in_metadata(article)
            restore_translation(article)

        return article
class transformBodyInHtmlXmlCmd(addXmlCmd):
    """
    transformBodyInHtmlXmlCmd: transform the JATS body in HTML

    TODO: handle images,...
    """

    use_body = False

    def internal_do(self):
        super().internal_do()

        # Compile the HTML stylesheet and run it on the parsed JATS tree.
        transform = etree.XSLT(etree.parse(settings.PTF_HTML_XSL))
        html_root = transform(self.tree).getroot()

        # The generated HTML of interest is the inner XML of <main>.
        main_node = html_root.find("body/article/main")
        return xmldata_jats.innerxml(main_node).decode("utf-8")
class addBodyInHtmlXmlCmd(addXmlCmd):
    """
    addBodyInHtmlXmlCmd: read the JATS body of an article
    and create the corresponding HTML

    TODO: handle images,... manage warnings for unused tag ?
    """

    def __init__(self, params=None):
        self.article = None
        self.pid = None

        super().__init__(params)

    def set_article(self, article):
        self.article = article

    def pre_do(self):
        super().pre_do()

        # At least one of pid/article must be provided; derive the missing one.
        if self.article is None:
            if self.pid is None:
                raise ValueError("pid et article sont vides")
            self.article = model_helpers.get_article(self.pid)

        if self.pid is None:
            self.pid = self.article.pid

    def internal_do(self):
        super().internal_do()

        parsed = jats_parser.JatsArticle(tree=self.tree, pid=self.pid)
        # should we collect the warnings of the HTML parsing?
        # self.warnings.extend(parsed.warnings)

        # Replace the html-image related objects with the figures found in the XML.
        self.article.relatedobject_set.filter(rel="html-image").delete()
        self.add_objects_with_location(parsed.figures, self.article, "RelatedObject")

        update_params = {
            "body_html": parsed.body_html,
            "body_tex": parsed.body_tex,
            "body_xml": parsed.body_xml,
            "use_page_count": False,
        }
        update_cmd = ptf_cmds.updateArticlePtfCmd(update_params)
        update_cmd.set_article(self.article)
        update_cmd.do()

        # copy_binary_files will call resolver.copy_html_images
        # to copy the article images
        # because updateArticlePtfCmd is not from addPtfCmd, need to copy files here
        resolver.copy_html_images(
            self.article, settings.MERSENNE_TEST_DATA_FOLDER, settings.CEDRAM_XML_FOLDER
        )
class updateCacheXmlCmd(baseCmd):
    """
    recreate the citation_html field of the bibitems

    Params:
        colid (required): pid of the collection to process
        start_id: optional container pid; containers that come before it
            (in pid order) are skipped

    Raises exceptions.ResourceDoesNotExist if the collection or an article
    is not in the database.
    """

    def __init__(self, params=None):
        self.colid = None
        self.start_id = None

        super().__init__(params)

        self.required_params.extend(["colid"])

    def update_article(self, xarticle):
        """
        Copy the html/tex fields parsed from the XML (xarticle) onto the
        article stored in the database.
        """
        article = model_helpers.get_article(xarticle.pid)
        if article is None:
            raise exceptions.ResourceDoesNotExist(f"Article {xarticle.pid} does not exist")

        article.title_html = xarticle.title_html
        article.title_tex = xarticle.title_tex
        article.trans_title_html = xarticle.trans_title_html
        article.trans_title_tex = xarticle.trans_title_tex
        article.save()

        # NOTE(review): zip stops at the shorter side — assumes the XML and the
        # database have the same abstracts/bibitems in the same order. TODO confirm.
        for xabstract, abstract in zip(xarticle.abstracts, article.abstract_set.all()):
            abstract.value_html = xabstract["value_html"]
            abstract.value_tex = xabstract["value_tex"]
            abstract.save()

        # for xkwd_group, kwd_group in zip(xarticle.kwd_groups, article.kwdgroup_set.all()):
        #     kwd_group.value_html = xkwd_group['value_html']
        #     kwd_group.value_tex = xkwd_group['value_tex']
        #     kwd_group.save()

        for xbib, bib in zip(xarticle.bibitems, article.bibitem_set.all()):
            bib.citation_html = xbib.citation_html
            bib.citation_tex = xbib.citation_tex
            bib.article_title_tex = xbib.article_title_tex
            bib.chapter_title_tex = xbib.chapter_title_tex
            bib.source_tex = xbib.source_tex
            bib.volume = xbib.volume
            bib.save()

        # Sites that display the full text also refresh the body fields
        if getattr(settings, "SHOW_BODY", False):
            params = {
                "body_html": xarticle.body_html,
                "body_tex": xarticle.body_tex,
                "body_xml": xarticle.body_xml,
                "use_page_count": False,
            }

            cmd = ptf_cmds.updateArticlePtfCmd(params)
            cmd.set_article(article)
            cmd.do()

    def internal_do(self):
        super().internal_do()

        collection = model_helpers.get_collection(self.colid)
        if collection is None:
            raise exceptions.ResourceDoesNotExist(f"Collection {self.colid} does not exist")

        qs = collection.content.all().order_by("pid")
        # Skip containers until start_id is reached (process everything if unset)
        start = self.start_id is None
        for container in qs:
            if not start and container.pid == self.start_id:
                start = True

            if start:
                print(container.pid)
                with_body = getattr(settings, "SHOW_BODY", False)
                xml_body = ptf_cmds.exportPtfCmd(
                    {"pid": container.pid, "with_body": with_body}
                ).do()

                # Re-parse the exported issue XML, then update each article
                parser = etree.XMLParser(
                    huge_tree=True,
                    recover=True,
                    remove_blank_text=False,
                    remove_comments=True,
                    resolve_entities=True,
                )
                tree = etree.fromstring(xml_body.encode("utf-8"), parser=parser)
                xissue = jats_parser.JatsIssue(tree=tree)

                for xarticle in xissue:
                    self.update_article(xarticle)