Coverage for apps/ptf/display/resolver.py: 74%
364 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1import os
2import shutil
3import time
5from django.conf import settings
7from ptf.cmds.xml import xml_utils
# Current year (UTC) captured once at import time; used by embargo().
NOW = time.gmtime()[0]
def resolve_id(id_type, id_value, force_numdam=False):
    """
    Build the URL associated with a bibliographic identifier.

    @param id_type: kind of identifier ("doi", "arxiv", "mr-item-id", ...)
    @param id_value: the identifier itself
    @param force_numdam: for numdam/mathdoc ids, link to numdam.org instead of
        a site-relative /item/ URL
    @return: the URL, or "" for an unknown id_type
    """
    # Simple cases: the URL is just a fixed prefix followed by the id.
    prefix_by_type = {
        "doi": "https://doi.org/",
        "zbl-item-id": "https://zbmath.org/?q=an:",
        "jfm-item-id": "https://zbmath.org/?q=an:",
        "sps-id": "http://sites.mathdoc.fr/cgi-bin/spitem?id=",
        "arxiv": "https://arxiv.org/abs/",
        "hal": "https://hal.archives-ouvertes.fr/",
        "tel": "https://tel.archives-ouvertes.fr/",
        "theses.fr": "https://theses.fr/",
        "orcid": "https://orcid.org/",
        "idref": "https://www.idref.fr/",
        "semantic-scholar": "https://www.semanticscholar.org/paper/",
        "pmid": "https://pubmed.ncbi.nlm.nih.gov/",
        "ark": "http://ark.bnf.fr/",
    }

    if id_type == "mr-item-id":
        # Normalize legacy MathSciNet values like "1234 #56" into "1234:56".
        if "#" in id_value:
            id_value = id_value.replace(" #", ":")
        return "https://mathscinet.ams.org/mathscinet-getitem?mr=" + id_value

    if id_type in ("nmid", "numdam-id", "mathdoc-id"):
        if force_numdam:
            return f"http://www.numdam.org/item/{id_value}"
        return f"/item/{id_value}"

    if id_type == "eudml-item-id":
        # Keep only the last colon-separated component ("urn:eudml:doc:123" -> "123").
        id_value = id_value.split(":")[-1]
        return "https://eudml.org/doc/" + id_value

    prefix = prefix_by_type.get(id_type)
    return prefix + id_value if prefix else ""
def find_id_type(id):
    """
    Guess the identifier type from its textual shape.

    @param id: an identifier string
    @return: "doi", "hal" or "arxiv", or None when unrecognized
    """
    if id.startswith("10."):
        return "doi"
    if id.startswith("hal-"):
        return "hal"
    if id.lower().startswith("arxiv:"):
        return "arxiv"
    return None
def get_mimetype(filename):
    """
    Return the MIME type of *filename*, derived from its extension
    (case-insensitive). Unknown extensions give "".
    """
    mime_by_extension = {
        "pdf": "application/pdf",
        "djvu": "image/x.djvu",
        "tex": "application/x-tex",
        "png": "image/png",
        "jpg": "image/jpeg",
    }
    extension = os.path.splitext(os.path.basename(filename).lower())[1][1:]
    return mime_by_extension.get(extension, "")
def get_article_base_url():
    """Return the base URL used to build article links (from Django settings)."""
    return settings.ARTICLE_BASE_URL
def get_issue_base_url():
    """Return the base URL used to build issue links (from Django settings)."""
    return settings.ISSUE_BASE_URL
def get_icon_base_url():
    """Return the base URL used to build icon links (from Django settings)."""
    return settings.ICON_BASE_URL
def get_icon_url(id_, filename):
    """
    Return the public URL of an icon.

    @param id_: resource id — currently unused, kept for interface compatibility
    @param filename: icon filename, appended to the icon base URL
    """
    return get_icon_base_url() + filename
def get_doi_url(doi):
    """Return the resolver URL for *doi* (settings.DOI_BASE_URL + doi)."""
    return f"{settings.DOI_BASE_URL}{doi}"
def get_relative_folder(collection_id, container_id=None, article_id=None):
    """
    Return the relative folder "collection[/container][/article]".

    Each optional id is appended only when truthy.
    """
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)
    return "/".join(parts)
def embargo(wall, year):
    """
    Tell whether a publication is still inside a moving-wall embargo.

    @param wall: embargo length in years; falsy means no embargo
    @param year: publication date, typically "YYYY" or "YYYY-MM-DD"
        (a plain int year is accepted too)
    @return: True if the publication is at most *wall* years old
    """
    if not wall:
        return False
    try:
        # Keep only the leading year component of the date.
        y = int(str(year).split("-")[0])
    except ValueError:
        # Unparsable year: fall back to the current year, i.e. the item
        # is conservatively treated as recent (NOW - y == 0 <= wall).
        # Narrowed from the original bare `except BaseException`.
        y = NOW
    return NOW - y <= wall
141# Iterate a folder with a collection
142# The folder must look like @COL/@ISSUE/@ISSUE.XML
def iterate_collection_folder(folder, pid, first_issue=""):
    """
    Iterate the issues of a collection stored on disk.

    The layout must be @COL/@ISSUE/@ISSUE.xml (and optionally
    @ISSUE-cdrxml.xml). Yields (issue_id, xml_path) pairs in sorted order,
    starting at *first_issue* when given (that issue included).
    """
    root_folder = os.path.join(folder, pid)

    started = not first_issue
    for entry in sorted(os.listdir(root_folder)):
        if entry == first_issue:
            started = True
        if not started:
            continue
        entry_dir = os.path.join(root_folder, entry)
        if not os.path.isdir(entry_dir):
            continue
        for suffix in (".xml", "-cdrxml.xml"):
            candidate = os.path.join(entry_dir, entry + suffix)
            if os.path.isfile(candidate):
                yield entry, candidate
def create_folder(folder):
    """
    Recursively create *folder*; a no-op if it already exists.

    @raise RuntimeError: if the folder could not be created
    """
    try:
        # exist_ok=True replaces the original try/except-everything idiom;
        # OSError is narrowed from the original bare `except BaseException`.
        os.makedirs(folder, exist_ok=True)
    except OSError:
        # Deliberate best-effort: the isdir check below reports the failure.
        pass

    if not os.path.isdir(folder):
        raise RuntimeError("Unable to create " + folder)
def copy_folder(from_dir, to_dir):
    """
    Recursively copy *from_dir* into *to_dir* (created if needed).
    Does nothing when *from_dir* is not a directory.
    """
    if not os.path.isdir(from_dir):
        return
    create_folder(to_dir)

    for name in os.listdir(from_dir):
        source = os.path.join(from_dir, name)
        if os.path.isfile(source):
            copy_file(source, to_dir)
        elif os.path.isdir(source):
            copy_folder(source, os.path.join(to_dir, name))
def copy_file(from_path, to_path):
    """
    Copy one file. *to_path* may be a directory, in which case the source
    basename is kept. Does nothing when *from_path* is not a file.
    """
    if not os.path.isfile(from_path):
        return
    if os.path.isdir(to_path):
        to_path = os.path.join(to_path, os.path.basename(from_path))

    if to_path.startswith(settings.MATHDOC_ARCHIVE_FOLDER):
        # copy2 attempts to preserve all file metadata
        # on /mathdoc_archive, we don't want to preserve the mode, just the dates
        shutil.copyfile(from_path, to_path)
        shutil.copystat(from_path, to_path)
    else:
        shutil.copy2(from_path, to_path)
def copy_html_images(resource, to_folder, from_folder):
    """
    Copy the figures associated with the HTML body of an article.

    if from_archive:
        Images are in settings.MATHDOC/@colid/@issue_id/@a_id/src/tex/figures/
    if from_cedram:
        Images are in settings.CEDRAM_TEX_FOLDER/@colid/@issue_id/@tex_aid/Fulltext/figures/
        (NOTE(review): the code below joins "FullText" — confirm which casing
        matches the real layout)

    @param resource: only Article resources are handled; anything else is a no-op
    @param to_folder: destination root folder
    @param from_folder: source root (settings.CEDRAM_XML_FOLDER or an archive root)
    @return: nothing
    """
    if resource.classname != "Article":
        return

    article_to_copy = resource
    issue = article_to_copy.my_container
    colid = article_to_copy.get_collection().pid

    if from_folder == settings.CEDRAM_XML_FOLDER:
        # Copy from the cedram TeX tree: figures live next to each article's TeX.
        tex_src_folder = get_cedram_issue_tex_folder(colid, issue.pid)
        tex_folders, _ = get_cedram_tex_folders(colid, issue.pid)

        if len(tex_folders) > 0:
            i = 0
            for article in issue.article_set.all():
                if article_to_copy.pid == article.pid:
                    # The registration order of articles in the DB matters:
                    # the TeX order is SUPPOSED to match the issue XML, so
                    # tex_folders[i] is assumed to be this article's TeX folder.
                    dest_folder = os.path.join(
                        to_folder,
                        get_relative_folder(colid, issue.pid, article.pid),
                        "src/tex/figures",
                    )

                    # Start from a clean destination folder.
                    if os.path.isdir(dest_folder):
                        try:
                            shutil.rmtree(dest_folder)
                        except OSError:
                            message = "Unable to remove " + dest_folder
                            raise RuntimeError(message)

                    src_folder = os.path.join(
                        tex_src_folder, tex_folders[i], "FullText", "figures"
                    )
                    # Only the images declared as related objects are copied.
                    qs = article.relatedobject_set.filter(rel="html-image")
                    if qs.count() > 0:
                        create_folder(dest_folder)

                    for related_obj in qs:
                        img_file = os.path.join(src_folder, os.path.basename(related_obj.location))
                        copy_file(img_file, dest_folder)

                i += 1
    else:
        # Copy from the archive: the whole folder containing the images, directly.
        dest_folder = os.path.join(
            to_folder,
            get_relative_folder(colid, issue.pid, article_to_copy.pid),
            "src/tex/figures",
        )
        if os.path.isdir(dest_folder):
            try:
                shutil.rmtree(dest_folder)
            except OSError:
                message = "Unable to remove " + dest_folder
                raise RuntimeError(message)

        src_folder = os.path.join(
            from_folder,
            get_relative_folder(colid, issue.pid, article_to_copy.pid),
            "src/tex/figures",
        )
        if os.path.isdir(src_folder):
            copy_folder(src_folder, dest_folder)
def copy_file_obj_to_article_folder(
    file_obj, colid, issue_pid, article_pid, is_image=False, article_container_pid=None
):
    """
    Store an uploaded file object under the article's resource folder.

    Regular files are renamed to @article_pid + original extension; images
    keep their own name and go under src/media.
    *article_container_pid* is currently unused.

    @return: the file path relative to settings.RESOURCES_ROOT
    """
    name, extension = os.path.splitext(file_obj.name)
    relative_folder = get_relative_folder(colid, issue_pid, article_pid)

    if is_image:
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder + "/src/media")
        basename = name + extension
    else:
        folder = os.path.join(settings.RESOURCES_ROOT, relative_folder)
        basename = article_pid + extension

    create_folder(folder)
    full_file_name = os.path.join(folder, basename)
    with open(full_file_name, "wb+") as destination:
        for chunk in file_obj.chunks():
            destination.write(chunk)

    # NOTE(review): for images the returned path uses article_pid + extension
    # although the file is saved as name + extension under src/media — looks
    # inconsistent; confirm against callers before changing.
    relative_file_name = os.path.join(relative_folder, article_pid + extension)

    return relative_file_name
def copy_binary_files(resource, from_folder, to_folder, binary_files=None):
    """
    Copy the binary files of a resource from one tree to another.

    When *binary_files* is None, the list is taken from the resource itself
    and the HTML figures are copied as well. Entries containing "http" are
    remote: their destination folder is still created but nothing is copied.
    """
    if from_folder == to_folder:
        return

    if binary_files is None:
        copy_html_images(resource, to_folder, from_folder)
        binary_files = resource.get_binary_files_location()

    for location in binary_files:
        to_path = os.path.join(to_folder, location)
        os.makedirs(os.path.dirname(to_path), exist_ok=True)

        if "http" in location:
            continue
        from_path = os.path.join(from_folder, location)
        if os.path.isfile(from_path):
            copy_file(from_path, to_path)
def delete_object_folder(object_folder, to_folder):
    """
    Remove the folder of an object from disk.

    No check is made on the folder content (the DB CASCADE mode is trusted),
    but a few critical roots are protected against accidental deletion.
    """
    folder = os.path.normpath(os.path.join(to_folder, object_folder))

    # Safety guards: never delete the production/archive roots.
    protected_roots = (
        "/mersenne_prod_data",
        "/mersenne_test_data",
        "/mathdoc_archive",
    )
    if folder in protected_roots or folder.startswith("/cedram_dev"):
        raise Exception("Attention, pb avec la suppression de " + folder)

    if os.path.isdir(folder):
        shutil.rmtree(folder)
def delete_file(path):
    """Remove *path* if it is a regular file; silently do nothing otherwise."""
    if os.path.isfile(path):
        os.remove(path)
def get_disk_location(
    root_folder, collection_id, ext, container_id=None, article_id=None, do_create_folder=False
):
    """
    Build the on-disk filename of a resource.

    The layout is root/collection[/container[/article]]/last_id.ext where
    last_id is the deepest id provided.

    @param root_folder: root of the tree
    @param collection_id: collection pid (always present)
    @param ext: filename extension, without the dot
    @param container_id: optional issue/container pid
    @param article_id: optional article pid
    @param do_create_folder: also create every intermediate folder
    @return: the full filename
    """
    # Collect the path components once; this removes the duplicated joins of
    # the original and fixes a latent TypeError when article_id was given
    # without container_id while do_create_folder was set.
    parts = [collection_id]
    if container_id:
        parts.append(container_id)
    if article_id:
        parts.append(article_id)

    folder = root_folder
    for part in parts:
        folder = os.path.join(folder, part)
        if do_create_folder:
            create_folder(folder)

    # The file is named after the deepest id provided.
    return os.path.join(folder, parts[-1] + "." + ext)
def get_body(filename):
    """Read *filename* as UTF-8 text and return its whole content."""
    with open(filename, encoding="utf-8") as file_:
        return file_.read()
def get_archive_filename(root_folder, colid, pid, ext, do_create_folder=False, article_pid=None):
    """
    Build the filename of an issue/collection/article inside an archive tree.

    :param root_folder: root folder of the archive. Ex: /mathdoc_archive
    :param colid: collection id
    :param pid: issue id (may be None for a collection-level file)
    :param ext: filename extension ("xml" or "json")
    :param do_create_folder: recursively create the sub folders
    :param article_pid: optional article id (only used when pid is set)
    :return: the full filename
    """
    # TODO: call get_disk_location(root_folder, colid, ext, pid, None, do_create_folder)

    if do_create_folder:
        folder = os.path.join(root_folder, colid)
        create_folder(folder)
        if pid:
            folder = os.path.join(folder, pid)
            create_folder(folder)
            if article_pid:
                folder = os.path.join(folder, article_pid)
                create_folder(folder)

    # The file is named after the deepest id actually used.
    segments = [colid]
    if pid:
        segments.append(pid)
        if article_pid:
            segments.append(article_pid)
    return os.path.join(root_folder, *segments, segments[-1] + "." + ext)
434# Read the XML of an issue/collection within an archive folder
435# The folder must look like @COL/@ISSUE/@ISSUE.XML
436# @COL/@COL.XML
def get_archive_body(root_folder, colid, pid):
    """
    Read the XML of an issue/collection within an archive folder.

    The folder must look like @COL/@ISSUE/@ISSUE.xml (or @COL/@COL.xml
    when pid is None).
    """
    filename = get_archive_filename(root_folder, colid, pid, "xml")
    return get_body(filename)
def is_tex_comment(text, i):
    """
    Tell whether position *i* in *text* is preceded by a TeX comment marker:
    a "%" (possibly after spaces) or the "%~" sequence.
    """
    # Skip spaces backwards (stops at index 0).
    while i > 0 and text[i] == " ":
        i -= 1

    if i >= 0 and text[i] == "%":
        return True
    if i > 0 and text[i] == "~" and text[i - 1] == "%":
        return True
    return False
def is_tex_def(text, i):
    """Tell whether the 4 characters before position i-1 are a TeX "\\def"."""
    return text[i - 5 : i - 1] == "\\def"
def is_tex_newcommand(text, i):
    """Tell whether the 11 characters before position i-1 are "\\newcommand"."""
    return text[i - 12 : i - 1] == "\\newcommand"
def get_cedram_issue_tex_folder(colid, issue_id):
    """Return the cedram TeX folder of an issue: CEDRAM_TEX_FOLDER/@colid/@issue_id."""
    return os.path.join(settings.CEDRAM_TEX_FOLDER, colid, issue_id)
def get_cedram_tex_folders(colid, issue_id):
    """
    Return article filenames in the cedram tex issue folder and the
    corresponding doi if present, extracted from the issue tex file.

    The issue file is scanned for \\includearticle / \\includeprearticle /
    \\includepreface commands; occurrences inside comments, \\def or
    \\newcommand are skipped.

    @param colid: collection id
    @param issue_id: issue id
    @return: (list of filename, list of doi) — same length, doi may be None
    """
    filenames = []
    dois = []

    body = ""
    issue_filename = os.path.join(get_cedram_issue_tex_folder(colid, issue_id), issue_id + ".tex")
    if os.path.isfile(issue_filename):
        # Some legacy files are latin-1 encoded; fall back when UTF-8 fails.
        try:
            with open(issue_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            with open(issue_filename, encoding="iso-8859-1") as f:
                body = f.read()

        # NOTE(review): only the preface search is case-insensitive — confirm
        # this asymmetry is intentional.
        lower_body = body.lower()

        # Find the first occurrence of any of the 3 include commands.
        li = []
        j = body.find("includearticle")
        if j >= 0:
            li.append(j)
        j = body.find("includeprearticle")
        if j >= 0:
            li.append(j)
        j = lower_body.find("includepreface")
        if j >= 0:
            li.append(j)
        i = min(li) if len(li) > 0 else -1

        while i >= 0:
            if (
                i > 1
                and not is_tex_comment(body, i - 2)
                and not is_tex_def(body, i)
                and not is_tex_newcommand(body, i)
            ):
                doi = None
                # Scan up to the "{", grabbing an optional doi=... option
                # (terminated by "," or "]") on the way.
                # NOTE(review): malformed TeX (no "{" / "}") would raise
                # IndexError here — presumably input is trusted.
                while body[i] != "{":
                    if len(body) > i + 4 and body[i : i + 4] == "doi=":
                        j = i + 4
                        while body[i] != "," and body[i] != "]":
                            i += 1
                        doi = xml_utils.normalize_space(body[j:i])
                    i += 1
                i += 1
                # Collect the argument between "{" and "}": the TeX folder name.
                filename = ""
                while body[i] != "}":
                    filename += body[i]
                    i += 1
                if len(filename) > 0:
                    filenames.append(filename)
                    dois.append(doi)
            else:
                i += 1

            # Look for the next include command after the current position.
            li = []
            j = body.find("includearticle", i)
            if j >= 0:
                li.append(j)
            j = body.find("includeprearticle", i)
            if j >= 0:
                li.append(j)
            j = lower_body.find("includepreface", i)
            if j >= 0:
                li.append(j)
            i = min(li) if len(li) > 0 else -1

    return filenames, dois
def get_bibtex_from_tex(tex_filename):
    """
    Extract the \\bibliography{...} argument from a TeX file.

    Occurrences inside TeX comments are skipped. When multiple uncommented
    \\bibliography commands exist, their arguments are concatenated.

    @param tex_filename: path of the TeX file
    @return: the bibtex filename(s), "" when none found or the file is missing
    """
    bibtex_filename = ""

    body = ""
    if os.path.isfile(tex_filename):
        # Some legacy files are latin-1 encoded; fall back when UTF-8 fails.
        try:
            with open(tex_filename, encoding="utf-8") as f:
                body = f.read()
        except UnicodeDecodeError:
            with open(tex_filename, encoding="iso-8859-1") as f:
                body = f.read()

        i = body.find("\\bibliography")
        while i >= 0:
            if i > 1 and not is_tex_comment(body, i - 2):
                # Collect the argument between "{" and "}".
                # NOTE(review): malformed TeX (no braces) would raise
                # IndexError here — presumably input is trusted.
                while body[i] != "{":
                    i += 1
                i += 1
                while body[i] != "}":
                    bibtex_filename += body[i]
                    i += 1
            else:
                i += 1

            i = body.find("\\bibliography", i)

    return bibtex_filename
# Peer Community Journal sections: slug -> display name.
PCJ_SECTIONS = {
    "animsci": "Animal Science",
    "archaeo": "Archaeology",
    "ecology": "Ecology",
    "ecotoxenvchem": "Ecotoxicology & Environmental Chemistry",
    "evolbiol": "Evolutionary Biology",
    "forestwoodsci": "Forest & Wood Sciences",
    "genomics": "Genomics",
    "healthmovsci": "Health & Movement Sciences",
    "infections": "Infections",
    "mcb": "Mathematical & Computational Biology",
    "microbiol": "Microbiology",
    "networksci": "Network Science",
    "neuro": "Neuroscience",
    "paleo": "Paleontology",
    "rr": "Registered Reports",
    "zool": "Zoology",
}

# Sections hosted by UGA.
PCJ_UGA_SECTION = ["healthmovsci", "rr"]
# Known PCJ conferences.
PCJ_CONFERENCES = ["Euring 2023"]
# Sections whose articles must carry the given topic.
PCJ_MANDATORY_TOPICS = {
    "ecology": "Ecology",
    "evolbiol": "Evolution",
    "genomics": "Genetics/genomics",
    "paleo": "Paleontology",
    "archaeo": "Archaeology",
    "microbiol": "Microbiology",
    "neuro": "Neuroscience",
}


def get_pci(value):
    """Return the display name of a PCJ section slug, or "" when unknown."""
    return PCJ_SECTIONS.get(value, "")
# Article types mapped to their French display labels (values are
# user-facing strings and must stay in French).
ARTICLE_TYPES = {
    "biographical-note": "Notice biographique",
    "book-review": "Recension d’ouvrage",
    "clarification": "Mise au point",
    "congress": "Intervention en colloque",
    "corrigendum": "Corrigendum",
    "editorial": "Éditorial",
    "erratum": "Erratum",
    "expression-of-concern": "Avertissement des éditeurs",
    "foreword": "Avant-propos",
    "guest-editors": "Rédacteurs invités",
    "historical-commentary": "Commentaire historique",
    "history-of-sciences": "Histoire des sciences et des idées",
    "letter": "Commentaire et réponse",
    "news": "C'est apparu dans la presse",
    "opinion": "Opinion / Perspective",
    "preliminary-communication": "Communication préliminaire",
    "research-article": "Article de recherche",
    "retraction": "Rétractation",
    "review": "Article de synthèse",
    "software-tool": "Outil logiciel",
}