Coverage for apps/ptf/display/resolver.py: 74%
364 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1import os
2import shutil
3import time
5from django.conf import settings
7from ptf.cmds.xml import xml_utils
# Current year (UTC) captured once at import time; used by embargo().
NOW = time.gmtime()[0]
def resolve_id(id_type, id_value, force_numdam=False):
    """
    Build the URL associated with a bibliographic identifier.

    @param id_type: kind of identifier ("doi", "arxiv", "mr-item-id", ...)
    @param id_value: the identifier itself
    @param force_numdam: for numdam/mathdoc ids, link to numdam.org instead of
        a site-relative /item/ URL
    @return: the URL, or "" for an unknown id_type
    """
    # Simple cases: the URL is just a fixed prefix followed by the id.
    prefix_by_type = {
        "doi": "https://doi.org/",
        "zbl-item-id": "https://zbmath.org/?q=an:",
        "jfm-item-id": "https://zbmath.org/?q=an:",
        "sps-id": "http://sites.mathdoc.fr/cgi-bin/spitem?id=",
        "arxiv": "https://arxiv.org/abs/",
        "hal": "https://hal.archives-ouvertes.fr/",
        "tel": "https://tel.archives-ouvertes.fr/",
        "theses.fr": "https://theses.fr/",
        "orcid": "https://orcid.org/",
        "idref": "https://www.idref.fr/",
        "semantic-scholar": "https://www.semanticscholar.org/paper/",
        "pmid": "https://pubmed.ncbi.nlm.nih.gov/",
        "ark": "http://ark.bnf.fr/",
    }

    if id_type == "mr-item-id":
        # Normalize legacy MathSciNet values like "1234 #56" into "1234:56".
        if "#" in id_value:
            id_value = id_value.replace(" #", ":")
        return "https://mathscinet.ams.org/mathscinet-getitem?mr=" + id_value

    if id_type in ("nmid", "numdam-id", "mathdoc-id"):
        if force_numdam:
            return f"http://www.numdam.org/item/{id_value}"
        return f"/item/{id_value}"

    if id_type == "eudml-item-id":
        # Keep only the last colon-separated component ("urn:eudml:doc:123" -> "123").
        id_value = id_value.split(":")[-1]
        return "https://eudml.org/doc/" + id_value

    prefix = prefix_by_type.get(id_type)
    return prefix + id_value if prefix else ""
def find_id_type(id):
    """
    Guess the identifier type from its textual shape.

    @param id: an identifier string
    @return: "doi", "hal" or "arxiv", or None when unrecognized
    """
    if id.startswith("10."):
        return "doi"
    if id.startswith("hal-"):
        return "hal"
    if id.lower().startswith("arxiv:"):
        return "arxiv"
    return None
def get_mimetype(filename):
    """
    Return the MIME type of *filename*, derived from its extension
    (case-insensitive). Unknown extensions give "".
    """
    mime_by_extension = {
        "pdf": "application/pdf",
        "djvu": "image/x.djvu",
        "tex": "application/x-tex",
        "png": "image/png",
        "jpg": "image/jpeg",
    }
    extension = os.path.splitext(os.path.basename(filename).lower())[1][1:]
    return mime_by_extension.get(extension, "")
def get_article_base_url():
    """Return the base URL used to build article links (from Django settings)."""
    return settings.ARTICLE_BASE_URL
def get_issue_base_url():
    """Return the base URL used to build issue links (from Django settings)."""
    return settings.ISSUE_BASE_URL
def get_icon_base_url():
    """Return the base URL used to build icon links (from Django settings)."""
    return settings.ICON_BASE_URL
def get_icon_url(id_, filename):
    """
    Return the public URL of an icon.

    @param id_: resource id — currently unused, kept for interface compatibility
    @param filename: icon filename, appended to the icon base URL
    """
    return get_icon_base_url() + filename
def get_doi_url(doi):
    """Return the resolver URL for *doi* (settings.DOI_BASE_URL + doi)."""
    return f"{settings.DOI_BASE_URL}{doi}"
def get_relative_folder(collection_id, container_id=None, article_id=None):
    """
    Return the relative folder "collection[/container][/article]".

    Each optional id is appended only when truthy.
    """
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)
    return "/".join(parts)
def embargo(wall, year):
    """
    Tell whether a publication is still inside a moving-wall embargo.

    @param wall: embargo length in years; falsy means no embargo
    @param year: publication date, typically "YYYY" or "YYYY-MM-DD"
        (a plain int year is accepted too)
    @return: True if the publication is at most *wall* years old
    """
    if not wall:
        return False
    try:
        # Keep only the leading year component of the date.
        y = int(str(year).split("-")[0])
    except ValueError:
        # Unparsable year: fall back to the current year, i.e. the item
        # is conservatively treated as recent (NOW - y == 0 <= wall).
        # Narrowed from the original bare `except BaseException`.
        y = NOW
    return NOW - y <= wall
141# Iterate a folder with a collection
142# The folder must look like @COL/@ISSUE/@ISSUE.XML
def iterate_collection_folder(folder, pid, first_issue=""):
    """
    Iterate the issues of a collection stored on disk.

    The layout must be @COL/@ISSUE/@ISSUE.xml (and optionally
    @ISSUE-cdrxml.xml). Yields (issue_id, xml_path) pairs in sorted order,
    starting at *first_issue* when given (that issue included).
    """
    root_folder = os.path.join(folder, pid)

    started = not first_issue
    for entry in sorted(os.listdir(root_folder)):
        if entry == first_issue:
            started = True
        if not started:
            continue
        entry_dir = os.path.join(root_folder, entry)
        if not os.path.isdir(entry_dir):
            continue
        for suffix in (".xml", "-cdrxml.xml"):
            candidate = os.path.join(entry_dir, entry + suffix)
            if os.path.isfile(candidate):
                yield entry, candidate
def create_folder(folder):
    """
    Recursively create *folder*; a no-op if it already exists.

    @raise RuntimeError: if the folder could not be created
    """
    try:
        # exist_ok=True replaces the original try/except-everything idiom;
        # OSError is narrowed from the original bare `except BaseException`.
        os.makedirs(folder, exist_ok=True)
    except OSError:
        # Deliberate best-effort: the isdir check below reports the failure.
        pass

    if not os.path.isdir(folder):
        raise RuntimeError("Unable to create " + folder)
def copy_folder(from_dir, to_dir):
    """
    Recursively copy *from_dir* into *to_dir* (created if needed).
    Does nothing when *from_dir* is not a directory.
    """
    if not os.path.isdir(from_dir):
        return
    create_folder(to_dir)

    for name in os.listdir(from_dir):
        source = os.path.join(from_dir, name)
        if os.path.isfile(source):
            copy_file(source, to_dir)
        elif os.path.isdir(source):
            copy_folder(source, os.path.join(to_dir, name))
def copy_file(from_path, to_path):
    """
    Copy one file. *to_path* may be a directory, in which case the source
    basename is kept. Does nothing when *from_path* is not a file.
    """
    if not os.path.isfile(from_path):
        return
    if os.path.isdir(to_path):
        to_path = os.path.join(to_path, os.path.basename(from_path))

    if to_path.startswith(settings.MATHDOC_ARCHIVE_FOLDER):
        # copy2 attempts to preserve all file metadata
        # on /mathdoc_archive, we don't want to preserve the mode, just the dates
        shutil.copyfile(from_path, to_path)
        shutil.copystat(from_path, to_path)
    else:
        shutil.copy2(from_path, to_path)
def copy_html_images(resource, to_folder, from_folder):
    """
    Copy the figures associated with the HTML body of an article.

    if from_archive:
        Images are in settings.MATHDOC/@colid/@issue_id/@a_id/src/tex/figures/
    if from_cedram:
        Images are in settings.CEDRAM_TEX_FOLDER/@colid/@issue_id/@tex_aid/Fulltext/figures/
        (NOTE(review): the code below joins "FullText" — confirm which casing
        matches the real layout)

    @param resource: only Article resources are handled; anything else is a no-op
    @param to_folder: destination root folder
    @param from_folder: source root (settings.CEDRAM_XML_FOLDER or an archive root)
    @return: nothing
    """
    if resource.classname != "Article":
        return

    article_to_copy = resource
    issue = article_to_copy.my_container
    colid = article_to_copy.get_collection().pid

    if from_folder == settings.CEDRAM_XML_FOLDER:
        # Copy from the cedram TeX tree: figures live next to each article's TeX.
        tex_src_folder = get_cedram_issue_tex_folder(colid, issue.pid)
        tex_folders, _ = get_cedram_tex_folders(colid, issue.pid)

        if len(tex_folders) > 0:
            i = 0
            for article in issue.article_set.all():
                if article_to_copy.pid == article.pid:
                    # The registration order of articles in the DB matters:
                    # the TeX order is SUPPOSED to match the issue XML, so
                    # tex_folders[i] is assumed to be this article's TeX folder.
                    dest_folder = os.path.join(
                        to_folder,
                        get_relative_folder(colid, issue.pid, article.pid),
                        "src/tex/figures",
                    )

                    # Start from a clean destination folder.
                    if os.path.isdir(dest_folder):
                        try:
                            shutil.rmtree(dest_folder)
                        except OSError:
                            message = "Unable to remove " + dest_folder
                            raise RuntimeError(message)

                    src_folder = os.path.join(
                        tex_src_folder, tex_folders[i], "FullText", "figures"
                    )
                    # Only the images declared as related objects are copied.
                    qs = article.relatedobject_set.filter(rel="html-image")
                    if qs.count() > 0:
                        create_folder(dest_folder)

                    for related_obj in qs:
                        img_file = os.path.join(src_folder, os.path.basename(related_obj.location))
                        copy_file(img_file, dest_folder)

                i += 1
    else:
        # Copy from the archive: the whole folder containing the images, directly.
        dest_folder = os.path.join(
            to_folder,
            get_relative_folder(colid, issue.pid, article_to_copy.pid),
            "src/tex/figures",
        )
        if os.path.isdir(dest_folder):
            try:
                shutil.rmtree(dest_folder)
            except OSError:
                message = "Unable to remove " + dest_folder
                raise RuntimeError(message)

        src_folder = os.path.join(
            from_folder,
            get_relative_folder(colid, issue.pid, article_to_copy.pid),
            "src/tex/figures",
        )
        if os.path.isdir(src_folder):
            copy_folder(src_folder, dest_folder)
def copy_file_obj_to_article_folder(
    file_obj, colid, issue_pid, article_pid, is_image=False, article_container_pid=None
):
    """
    Store an uploaded file object under the article's resource folder.

    Regular files are renamed to @article_pid + original extension; images
    keep their own name and go under src/media.
    *article_container_pid* is currently unused.

    @return: the file path relative to settings.RESOURCES_ROOT
    """
    name, extension = os.path.splitext(file_obj.name)
    relative_folder = get_relative_folder(colid, issue_pid, article_pid)

    if is_image:
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder + "/src/media")
        basename = name + extension
    else:
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder)
        basename = article_pid + extension

    create_folder(folder)
    full_file_name = os.path.join(folder, basename)
    with open(full_file_name, "wb+") as destination:
        for chunk in file_obj.chunks():
            destination.write(chunk)

    # NOTE(review): for images the returned path uses article_pid + extension
    # although the file is saved as name + extension under src/media — looks
    # inconsistent; confirm against callers before changing.
    relative_file_name = os.path.join(relative_folder, article_pid + extension)

    return relative_file_name
def copy_binary_files(resource, from_folder, to_folder, binary_files=None):
    """
    Copy the binary files of a resource from one tree to another.

    When *binary_files* is None, the list is taken from the resource itself
    and the HTML figures are copied as well. Entries containing "http" are
    remote: their destination folder is still created but nothing is copied.
    """
    if from_folder == to_folder:
        return

    if binary_files is None:
        copy_html_images(resource, to_folder, from_folder)
        binary_files = resource.get_binary_files_location()

    for location in binary_files:
        to_path = os.path.join(to_folder, location)
        os.makedirs(os.path.dirname(to_path), exist_ok=True)

        if "http" in location:
            continue
        from_path = os.path.join(from_folder, location)
        if os.path.isfile(from_path):
            copy_file(from_path, to_path)
def delete_object_folder(object_folder, to_folder):
    """
    Remove the folder of an object from disk.

    No check is made on the folder content (the DB CASCADE mode is trusted),
    but a few critical roots are protected against accidental deletion.
    """
    folder = os.path.normpath(os.path.join(to_folder, object_folder))

    # Safety guards: never delete the production/archive roots.
    protected_roots = (
        "/mersenne_prod_data",
        "/mersenne_test_data",
        "/mathdoc_archive",
    )
    if folder in protected_roots or folder.startswith("/cedram_dev"):
        raise Exception("Attention, pb avec la suppression de " + folder)

    if os.path.isdir(folder):
        shutil.rmtree(folder)
def delete_file(path):
    """Remove *path* if it is a regular file; silently do nothing otherwise."""
    if os.path.isfile(path):
        os.remove(path)
def get_disk_location(
    root_folder, collection_id, ext, container_id=None, article_id=None, do_create_folder=False
):
    """
    Build the on-disk filename of a resource.

    The layout is root/collection[/container[/article]]/last_id.ext where
    last_id is the deepest id provided.

    @param root_folder: root of the tree
    @param collection_id: collection pid (always present)
    @param ext: filename extension, without the dot
    @param container_id: optional issue/container pid
    @param article_id: optional article pid
    @param do_create_folder: also create every intermediate folder
    @return: the full filename
    """
    # Collect the path components once; this removes the duplicated joins of
    # the original and fixes a latent TypeError when article_id was given
    # without container_id while do_create_folder was set.
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)

    folder = root_folder
    for part in parts:
        folder = os.path.join(folder, part)
        if do_create_folder:
            create_folder(folder)

    # The file is named after the deepest id provided.
    return os.path.join(folder, parts[-1] + "." + ext)
def get_body(filename):
    """Read *filename* as UTF-8 text and return its whole content."""
    with open(filename, encoding="utf-8") as file_:
        return file_.read()
def get_archive_filename(root_folder, colid, pid, ext, do_create_folder=False, article_pid=None):
    """
    Build the filename of an issue/collection/article inside an archive tree.

    :param root_folder: root folder of the archive. Ex: /mathdoc_archive
    :param colid: collection id
    :param pid: issue id (may be None for a collection-level file)
    :param ext: filename extension ("xml" or "json")
    :param do_create_folder: recursively create the sub folders
    :param article_pid: optional article id (only used when pid is set)
    :return: the full filename
    """
    # TODO: call get_disk_location(root_folder, colid, ext, pid, None, do_create_folder)

    if do_create_folder:
        folder = os.path.join(root_folder, colid)
        create_folder(folder)
        if pid:
            folder = os.path.join(folder, pid)
            create_folder(folder)
            if article_pid:
                folder = os.path.join(folder, article_pid)
                create_folder(folder)

    # The file is named after the deepest id actually used.
    segments = [colid]
    if pid:
        segments.append(pid)
        if article_pid:
            segments.append(article_pid)
    return os.path.join(root_folder, *segments, segments[-1] + "." + ext)
434# Read the XML of an issue/collection within an archive folder
435# The folder must look like @COL/@ISSUE/@ISSUE.XML
436# @COL/@COL.XML
def get_archive_body(root_folder, colid, pid):
    """
    Read the XML of an issue/collection within an archive folder.

    The folder must look like @COL/@ISSUE/@ISSUE.xml (or @COL/@COL.xml
    when pid is None).
    """
    filename = get_archive_filename(root_folder, colid, pid, "xml")
    return get_body(filename)
def is_tex_comment(text, i):
    """
    Tell whether position *i* in *text* is preceded by a TeX comment marker:
    a "%" (possibly after spaces) or the "%~" sequence.
    """
    # Skip spaces backwards (stops at index 0).
    while i > 0 and text[i] == " ":
        i -= 1

    if i >= 0 and text[i] == "%":
        return True
    if i > 0 and text[i] == "~" and text[i - 1] == "%":
        return True
    return False
def is_tex_def(text, i):
    """Tell whether the 4 characters before position i-1 are a TeX "\\def"."""
    return text[i - 5 : i - 1] == "\\def"
def is_tex_newcommand(text, i):
    """Tell whether the 11 characters before position i-1 are "\\newcommand"."""
    return text[i - 12 : i - 1] == "\\newcommand"
def get_cedram_issue_tex_folder(colid, issue_id):
    """Return the cedram TeX folder of an issue: CEDRAM_TEX_FOLDER/@colid/@issue_id."""
    return os.path.join(settings.CEDRAM_TEX_FOLDER, colid, issue_id)
def get_cedram_tex_folders(colid, issue_id):
    """
    Return article filenames in the cedram tex issue folder and the
    corresponding doi if present, extracted from the issue tex file.

    The issue file is scanned for \\includearticle / \\includeprearticle /
    \\includepreface commands; occurrences inside comments, \\def or
    \\newcommand are skipped.

    @param colid: collection id
    @param issue_id: issue id
    @return: (list of filename, list of doi) — same length, doi may be None
    """
    filenames = []
    dois = []

    body = ""
    issue_filename = os.path.join(get_cedram_issue_tex_folder(colid, issue_id), issue_id + ".tex")
    if os.path.isfile(issue_filename):
        # Some legacy files are latin-1 encoded; fall back when UTF-8 fails.
        try:
            with open(issue_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            with open(issue_filename, encoding="iso-8859-1") as f:
                body = f.read()

        # NOTE(review): only the preface search is case-insensitive — confirm
        # this asymmetry is intentional.
        lower_body = body.lower()

        # Find the first occurrence of any of the 3 include commands.
        li = []
        j = body.find("includearticle")
        if j >= 0:
            li.append(j)
        j = body.find("includeprearticle")
        if j >= 0:
            li.append(j)
        j = lower_body.find("includepreface")
        if j >= 0:
            li.append(j)
        i = min(li) if len(li) > 0 else -1

        while i >= 0:
            if (
                i > 1
                and not is_tex_comment(body, i - 2)
                and not is_tex_def(body, i)
                and not is_tex_newcommand(body, i)
            ):
                doi = None
                # Scan up to the "{", grabbing an optional doi=... option
                # (terminated by "," or "]") on the way.
                # NOTE(review): malformed TeX (no "{" / "}") would raise
                # IndexError here — presumably input is trusted.
                while body[i] != "{":
                    if len(body) > i + 4 and body[i : i + 4] == "doi=":
                        j = i + 4
                        while body[i] != "," and body[i] != "]":
                            i += 1
                        doi = xml_utils.normalize_space(body[j:i])
                    i += 1
                i += 1
                # Collect the argument between "{" and "}": the TeX folder name.
                filename = ""
                while body[i] != "}":
                    filename += body[i]
                    i += 1
                if len(filename) > 0:
                    filenames.append(filename)
                    dois.append(doi)
            else:
                i += 1

            # Look for the next include command after the current position.
            li = []
            j = body.find("includearticle", i)
            if j >= 0:
                li.append(j)
            j = body.find("includeprearticle", i)
            if j >= 0:
                li.append(j)
            j = lower_body.find("includepreface", i)
            if j >= 0:
                li.append(j)
            i = min(li) if len(li) > 0 else -1

    return filenames, dois
def get_bibtex_from_tex(tex_filename):
    """
    Extract the \\bibliography{...} argument from a TeX file.

    Occurrences inside TeX comments are skipped. When multiple uncommented
    \\bibliography commands exist, their arguments are concatenated.

    @param tex_filename: path of the TeX file
    @return: the bibtex filename(s), "" when none found or the file is missing
    """
    bibtex_filename = ""

    body = ""
    if os.path.isfile(tex_filename):
        # Some legacy files are latin-1 encoded; fall back when UTF-8 fails.
        try:
            with open(tex_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            with open(tex_filename, encoding="iso-8859-1") as f:
                body = f.read()

        i = body.find("\\bibliography")
        while i >= 0:
            if i > 1 and not is_tex_comment(body, i - 2):
                # Collect the argument between "{" and "}".
                # NOTE(review): malformed TeX (no braces) would raise
                # IndexError here — presumably input is trusted.
                while body[i] != "{":
                    i += 1
                i += 1
                while body[i] != "}":
                    bibtex_filename += body[i]
                    i += 1
            else:
                i += 1

            i = body.find("\\bibliography", i)

    return bibtex_filename
# Peer Community Journal sections: slug -> display name.
PCJ_SECTIONS = {
    "animsci": "Animal Science",
    "archaeo": "Archaeology",
    "ecology": "Ecology",
    "ecotoxenvchem": "Ecotoxicology & Environmental Chemistry",
    "evolbiol": "Evolutionary Biology",
    "forestwoodsci": "Forest & Wood Sciences",
    "genomics": "Genomics",
    "healthmovsci": "Health & Movement Sciences",
    "infections": "Infections",
    "mcb": "Mathematical & Computational Biology",
    "microbiol": "Microbiology",
    "networksci": "Network Science",
    "neuro": "Neuroscience",
    "paleo": "Paleontology",
    "rr": "Registered Reports",
    "zool": "Zoology",
}

# Sections hosted by UGA.
PCJ_UGA_SECTION = ["healthmovsci", "rr"]
# Known PCJ conferences.
PCJ_CONFERENCES = ["Euring 2023"]
# Sections whose articles must carry the given topic.
PCJ_MANDATORY_TOPICS = {
    "ecology": "Ecology",
    "evolbiol": "Evolution",
    "genomics": "Genetics/genomics",
    "paleo": "Paleontology",
    "archaeo": "Archaeology",
    "microbiol": "Microbiology",
    "neuro": "Neuroscience",
}


def get_pci(value):
    """Return the display name of a PCJ section slug, or "" when unknown."""
    return PCJ_SECTIONS.get(value, "")
# Article types mapped to their French display labels (values are
# user-facing strings and must stay in French).
ARTICLE_TYPES = {
    "biographical-note": "Notice biographique",
    "book-review": "Recension d’ouvrage",
    "clarification": "Mise au point",
    "congress": "Intervention en colloque",
    "corrigendum": "Corrigendum",
    "editorial": "Éditorial",
    "erratum": "Erratum",
    "expression-of-concern": "Avertissement des éditeurs",
    "foreword": "Avant-propos",
    "guest-editors": "Rédacteurs invités",
    "historical-commentary": "Commentaire historique",
    "history-of-sciences": "Histoire des sciences et des idées",
    "letter": "Commentaire et réponse",
    "news": "C'est apparu dans la presse",
    "opinion": "Opinion / Perspective",
    "preliminary-communication": "Communication préliminaire",
    "research-article": "Article de recherche",
    "retraction": "Rétractation",
    "review": "Article de synthèse",
    "software-tool": "Outil logiciel",
}