Coverage for apps/ptf/cmds/ptf_cmds/archive_ptf_cmds.py: 29%

317 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-02-28 09:09 +0000

1import datetime 

2import os 

3import shutil 

4import subprocess 

5 

6import lxml.etree as etree 

7import requests 

8 

9from django.conf import settings 

10 

11from ptf import model_helpers 

12from ptf.cmds.database_cmds import baseCmd 

13from ptf.display import resolver 

14 

15from .base_ptf_cmds import exportExtraDataPtfCmd 

16from .base_ptf_cmds import exportPtfCmd 

17 

18 

def create_toc_xml(colid, issues):
    """
    Create the toc.xml file for the collection.

    The file contains a <journal> root with one <journal-meta> child per issue.
    Each child holds the collection id, the year, the optional series / volume /
    number, and the issue folder (the issue pid).

    The file is written in settings.MATHDOC_ARCHIVE_FOLDER/<colid> when that folder
    is readable (an existing readable toc.xml is removed first). Otherwise it is
    written in the current working directory.

    :param colid: pid of the collection (for example "JEP")
    :param issues: iterable of objects exposing year, vseries, volume, number and pid
    """
    collection_folder = os.path.join(settings.MATHDOC_ARCHIVE_FOLDER, colid)

    # Work with an explicit path instead of os.chdir(): changing the working
    # directory is a process-wide side effect that outlives this function.
    if os.access(collection_folder, os.R_OK):
        toc_path = os.path.join(collection_folder, "toc.xml")
        if os.access(toc_path, os.R_OK):
            os.remove(toc_path)
    else:
        toc_path = "toc.xml"

    journal = etree.Element("journal")

    for issue in issues:
        issue_node = etree.SubElement(journal, "journal-meta")

        etree.SubElement(issue_node, "journal-id").text = colid  # For example: JEP
        etree.SubElement(issue_node, "year").text = issue.year  # For example: 2019

        # series, volume and number are optional: empty (or None) values are skipped
        optional_fields = (
            ("series", issue.vseries),
            ("volume", issue.volume),
            ("number", issue.number),
        )
        for tag, value in optional_fields:
            if value:
                etree.SubElement(issue_node, tag).text = value

        etree.SubElement(issue_node, "folder").text = issue.pid

    node_str = etree.tostring(journal, pretty_print=True, encoding="unicode")

    # The context manager closes the file even if the write fails.
    # UTF-8 is forced so that the result does not depend on the server locale.
    with open(toc_path, "w", encoding="utf-8") as toc_file:
        toc_file.write(node_str)

65 

66 

class archiveCollectionPtfCmd(baseCmd):
    """
    Archive the collection on disk.

    The collection is exported with exportPtfCmd into settings.MATHDOC_ARCHIVE_FOLDER,
    then its toc.xml is (re)created from the list of issues.

    Expected params:
    - "colid": pid of the collection (required)
    - "issues": the issues listed in toc.xml
    """

    def __init__(self, params=None):
        super().__init__(params)

        # Every attribute read by internal_do is always defined, including when
        # the command is built without params.
        self.mathdoc_archive = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        if params is None:
            self.pid = None
            self.issues = []
        else:
            # A missing key still raises KeyError, as before
            self.pid = params["colid"]
            self.issues = params["issues"]

        # NOTE(review): baseCmd presumably validates required_params before
        # internal_do is called - confirm in ptf.cmds.database_cmds
        self.required_params.extend(["colid"])

    def internal_do(self):
        exportPtfCmd(
            {
                "pid": self.pid,  # Export the collection
                "export_folder": self.mathdoc_archive,  # Export into the archive folder
                "for_archive": True,  # Export for the archive
                "with_internal_data": False,  # Do not export the internal data
                "with_binary_files": True,  # Export the binary files (ex: img)
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        create_toc_xml(self.pid, self.issues)

96 

97 

class archiveIssuePtfCmd(baseCmd):
    """
    Archive the issue on disk.

    The issue XML, the binary files, the extra data, the LaTeX sources, the
    digitisation sources and the PDF/A files are written below self.export_folder.
    Call set_article() beforehand to archive only 1 article of the issue.
    """

    def __init__(self, params=None):
        self.pid = None  # container pid
        self.article = None  # Allow archiving of only 1 article
        self.skip_pdfa = False

        # params defaults to None: guard the lookup so that building the command
        # without params does not crash
        if params is not None and "issue" in params:
            self.issue = params["issue"]
        else:
            self.issue = None

        # The derived archiveNumdamResourcePtfCmd class sets other default values
        # before calling this constructor
        if not hasattr(self, "export_folder"):
            self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        if not hasattr(self, "binary_files_folder"):
            self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        super().__init__(params)

        self.required_params.extend(["pid"])

    @staticmethod
    def _write_archive_log(message, filename="archive.log"):
        """Append message, followed by a newline, to a log file of settings.LOG_DIR."""
        with open(os.path.join(settings.LOG_DIR, filename), "a", encoding="utf-8") as file_:
            file_.write(f"{message}\n")

    # Pass an article to archive only 1 article
    def set_article(self, article):
        self.article = article

    def archive_tex_src(self, article, colid, tex_src_folder, tex_article_folder):
        """
        Copy the files needed to re-compile the LaTeX source of an article into
        <export_folder>/<article relative folder>/src/tex/

        :param article: article object; only article.get_relative_folder() is used
        :param colid: pid of the collection (not used by this method)
        :param tex_src_folder: folder that contains the TeX folders of the issue
        :param tex_article_folder: name of the article TeX folder. It is also the
            base name of the .tex and .aux files
        :raises RuntimeError: if a shell command fails with a return code other than 1
        """
        src_folder = os.path.join(tex_src_folder, tex_article_folder)
        dest_folder = os.path.join(self.export_folder, article.get_relative_folder(), "src/tex/")

        resolver.create_folder(dest_folder)
        self._write_archive_log(f"Create {dest_folder}")

        # 1. Copy tex file
        tex_file = os.path.join(src_folder, tex_article_folder + ".tex")
        resolver.copy_file(tex_file, dest_folder)

        # 2. sty files
        sty_files = [
            os.path.join(src_folder, f)
            for f in os.listdir(src_folder)
            if os.path.isfile(os.path.join(src_folder, f)) and f.endswith(".sty")
        ]
        for sty_file in sty_files:
            resolver.copy_file(sty_file, dest_folder)

        # 3. bib file
        aux_file = os.path.join(src_folder, tex_article_folder + ".aux")
        if os.path.isfile(aux_file):
            # The name of the bib file is the text between { and } on the
            # "bibdata" lines of the .aux file.
            # An argument list (no shell) is used so that spaces or shell
            # metacharacters in the path cannot alter the command.
            grep_cmd = ["grep", "bibdata", aux_file]
            try:
                result = subprocess.check_output(grep_cmd).decode(encoding="utf-8")
                for line in result.split("\n"):
                    words = line.split("{")
                    if len(words) > 1:
                        words = words[1].split("}")
                        if len(words) > 1:
                            base_file = words[0]

                            full_src_file = os.path.join(src_folder, base_file + ".bib")
                            if os.path.isfile(full_src_file):
                                full_dest_file = os.path.join(dest_folder, base_file + ".bib")
                                resolver.copy_file(full_src_file, full_dest_file)
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:  # grep returns 1 if nothing was found
                    cmd_str = " ".join(grep_cmd)
                    message = f'Error {e.returncode} with "{cmd_str}": {e.output}'
                    raise RuntimeError(message) from e
        else:
            # No .aux file: get the name of the bib file from the .tex file
            base_file = resolver.get_bibtex_from_tex(tex_file)
            if base_file:
                full_src_file = os.path.join(src_folder, base_file + ".bib")
                if os.path.isfile(full_src_file):
                    full_dest_file = os.path.join(dest_folder, base_file + ".bib")
                    resolver.copy_file(full_src_file, full_dest_file)

        # 4. cdrdoidates, figures/ folder,...
        # The shell is still needed for the *.fls glob and the pipes, but the
        # command is now a constant string: the folder is given with cwd instead
        # of being concatenated after "cd".
        cmd_str = r"grep -v /usr/local/texlive/ *.fls | grep '\./' | grep -v '\.out$' | sort -u"

        try:
            result = subprocess.check_output(cmd_str, shell=True, cwd=src_folder).decode(
                encoding="utf-8"
            )
            for line in result.split("\n"):
                words = line.split(" ")
                if len(words) > 1:
                    # The 2nd word of the line is a path relative to src_folder
                    file = words[1]
                    file_folder = os.path.dirname(file)

                    full_src_file = os.path.join(src_folder, file)
                    if os.path.isfile(full_src_file):
                        file_dest_folder = os.path.join(dest_folder, file_folder)
                        resolver.create_folder(file_dest_folder)

                        full_dest_file = os.path.join(dest_folder, file)
                        resolver.copy_file(full_src_file, full_dest_file)
        except subprocess.CalledProcessError as e:
            if e.returncode != 1:  # grep returns 1 if nothing was found
                message = f'Error {e.returncode} with "{cmd_str}" in {src_folder}: {e.output}'
                raise RuntimeError(message) from e

    def create_pdfa(self, colid, article_pid):
        """
        Create the PDF/A file of an article PDF with ghostscript (gs).

        <article_pid>.pdf is read from, and <article_pid>_PDFA.pdf is written to,
        <export_folder>/<colid>/<self.pid>/<article_pid>/

        :raises RuntimeError: if the conversion fails. The article pid is also
            appended to archive_error.log
        """

        article_folder = os.path.join(self.export_folder, colid, self.pid, article_pid)
        in_file = os.path.join(article_folder, article_pid + ".pdf")
        out_file = os.path.join(article_folder, article_pid + "_PDFA.pdf")

        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed to gs untouched.
        gs_cmd = [
            "gs",
            "-dSAFER",
            "-dBATCH",
            "-DNOPAUSE",
            "-sPAPERSIZE=halfletter",
            "-dPDFFitPage",
            "-dFIXEDMEDIA",
            "-dEmbedAllFonts=true",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPrinted=false",
            "-dPDFSETTINGS=/printer",
            "-q",
            "-o",
            out_file,
            in_file,
        ]
        try:
            code_return = subprocess.run(gs_cmd, check=False).returncode
        except OSError:
            # gs is missing or can not be executed: handled like a failed conversion
            code_return = -1

        if code_return != 0:  # A non-zero return code means that the conversion failed
            self._write_archive_log(article_pid + " , PDF/A", "archive_error.log")

            raise RuntimeError(f"Le fichier {in_file} n'a pas pu être converti en PDFA")

        self._write_archive_log(f"Write {out_file}")

    def backup_pdfa_if_necessary(self, colid, article_pids):
        """
        Since it is time consuming to create a PDF/A, check if they have to be re-created
        before deleting the /mathdoc_archive issue folder.

        If not, backup the PDF/A in a temp folder. They will be put back in /mathdoc_archive
        in create_or_restore_pdfa()

        To check, we compare the date (day of the last modification) of the article PDF between
        - /mathdoc_archive and
        - self.binary_files_folder (/mersenne_prod_data or /numdam_data)

        :return: dict {article_pid: True if the PDF/A has to be created, False if it was backed up}
        """

        pdfas = {}
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)
        resolver.create_folder(tmp_folder)
        self._write_archive_log(f"Create {tmp_folder}")

        # Before deleting the issue folder, check if we have to recreate the PDF/A
        # If the PDF has been updated in /mersenne_prod_data, we recreate the PDF/A
        for article_pid in article_pids:
            pdf_in_archive = resolver.get_disk_location(
                self.export_folder, colid, "pdf", self.pid, article_pid
            )
            pdf_in_prod = resolver.get_disk_location(
                self.binary_files_folder, colid, "pdf", self.pid, article_pid
            )
            pdfa_in_archive = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )

            do_create_pdfa = True

            if os.path.isfile(pdf_in_prod) and os.path.isfile(pdf_in_archive):
                date_pdf_in_prod = datetime.datetime.fromtimestamp(
                    os.stat(pdf_in_prod).st_mtime
                ).strftime("%Y-%m-%d")
                date_pdf_in_archive = datetime.datetime.fromtimestamp(
                    os.stat(pdf_in_archive).st_mtime
                ).strftime("%Y-%m-%d")

                do_create_pdfa = (
                    not os.path.isfile(pdfa_in_archive) or date_pdf_in_prod != date_pdf_in_archive
                )

            pdfas[article_pid] = do_create_pdfa
            if not do_create_pdfa:
                # Copy the PDF/A in the temp folder
                dest_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                resolver.copy_file(pdfa_in_archive, dest_pdfa)
                self._write_archive_log(f"Backup {dest_pdfa}")

        return pdfas

    def create_or_restore_pdfa(self, colid, article_pids, pdfas):
        """
        For each article, create the PDF/A or restore the one saved by
        backup_pdfa_if_necessary(), then delete the temp folder.

        :param pdfas: dict returned by backup_pdfa_if_necessary()
        """
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)

        for article_pid in article_pids:
            if pdfas[article_pid]:
                self.create_pdfa(colid, article_pid)
            else:
                src_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                dest_pdfa = os.path.join(
                    self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
                )
                resolver.copy_file(src_pdfa, dest_pdfa)
                self._write_archive_log(f"Restore {dest_pdfa}")

        if os.path.isdir(tmp_folder):
            shutil.rmtree(tmp_folder)
            self._write_archive_log(f"Delete {tmp_folder}")

    def internal_do(self):
        super().internal_do()

        if self.article is None:
            container = model_helpers.get_container(self.pid, prefetch=False)
            qs = container.article_set.all()
            article_pids = list(qs.values_list("pid", flat=True))
            articles = qs
        else:
            container = self.article.my_container
            article_pids = [self.article.pid]
            articles = [self.article]

        colid = container.get_top_collection().pid
        self.pid = container.pid

        if not self.skip_pdfa:
            pdfas = self.backup_pdfa_if_necessary(colid, article_pids)

        # Delete the issue folder if we archive an issue
        if self.article is None:
            issue_folder = container.get_relative_folder()
            self._write_archive_log(f"Delete {self.export_folder}/{issue_folder}")

            resolver.delete_object_folder(issue_folder, to_folder=self.export_folder)

        # I. Always archive the issue XML, even if we archive only 1 article
        exportPtfCmd(
            {
                "pid": self.pid,
                "export_folder": self.export_folder,
                # binary files for 1 article are copied below
                "with_binary_files": self.article is None,
                "for_archive": True,
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        # II. Copy binary files (PDF...)
        for a in articles:
            article_folder = a.get_relative_folder()
            self._write_archive_log(f"Delete {article_folder}")

            resolver.delete_object_folder(article_folder, to_folder=self.export_folder)
            resolver.copy_binary_files(a, self.binary_files_folder, self.export_folder)

        params = {"pid": self.pid, "export_folder": self.export_folder, "export_all": False}
        exportExtraDataPtfCmd(params).do()

        tex_src_folder = resolver.get_cedram_issue_tex_folder(colid, self.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(colid, self.pid)

        # III. Articles written in LaTeX. We need to archive files needed to re-compile the LaTex source code
        if len(tex_folders) > 0:
            # tex_folders is indexed by the position of the article among the
            # articles of the container that are not flagged do_not_publish
            published_articles = container.article_set.exclude(do_not_publish=True)
            for i, article in enumerate(published_articles):
                if self.article is None or self.article.pid == article.pid:
                    self.archive_tex_src(article, colid, tex_src_folder, tex_folders[i])

        # IV. Digitized papers (Numdam). We basically need to archive the images (*.tif)
        copy_numdam_src_files(colid, self.pid, article_pids, self.export_folder)

        # V. Create PDF/A
        if not self.skip_pdfa:
            self.create_or_restore_pdfa(colid, article_pids, pdfas)

397 

398 

def archive_numdam_xml(colid, pid, export_folder):
    """
    Download the XML of a collection (pid is None) or of an issue from
    settings.NUMDAM_URL and write it in the archive.

    Nothing is written if the response body is empty.
    An HTTP error status raises (response.raise_for_status()).
    """
    item_id = colid if pid is None else pid
    url = settings.NUMDAM_URL + "/api-item-xml/" + item_id

    response = requests.get(url)
    response.raise_for_status()

    xml_body = response.content.decode("utf-8")
    if not xml_body:
        return

    xml_filename = resolver.get_archive_filename(export_folder, colid, pid, "xml", True)
    with open(xml_filename, "w", encoding="utf-8") as xml_file:
        xml_file.write(xml_body)

418 

419 

def get_numdam_issues_list(colid):
    """
    Return the "issues" entry of the JSON served by
    <settings.NUMDAM_URL>/api-issues/<colid>

    An HTTP error status raises (response.raise_for_status()).
    """
    url = f"{settings.NUMDAM_URL}/api-issues/{colid}"

    response = requests.get(url)
    response.raise_for_status()

    payload = response.json()
    return payload["issues"]

428 

429 

def get_numdam_file_list(colid, pid):
    """
    Return the decoded JSON served by <settings.NUMDAM_URL>/api-item-file-list/<id>,
    where <id> is the collection id if pid is None, the pid otherwise.

    It lists the files to archive: those visible by the user, like PDF or DjVus.
    An HTTP error status raises (response.raise_for_status()).
    """
    item_id = colid if pid is None else pid
    url = settings.NUMDAM_URL + "/api-item-file-list/" + item_id

    response = requests.get(url)
    response.raise_for_status()

    return response.json()

445 

446 

def _copy_numdam_digitisation_files(src_folder, dest_folder, xml_basename, log_file=None):
    """
    Create dest_folder, then copy into it <xml_basename>.xml (if it exists in
    src_folder) and the *.tif / *.jpg files of src_folder.
    Progress is written to log_file when one is given.
    """
    if log_file:
        log_file.write("Create " + dest_folder + "...")

    resolver.create_folder(dest_folder)

    if log_file:
        log_file.write("done\n")

    # a. XML file
    full_src_file = os.path.join(src_folder, xml_basename + ".xml")
    if os.path.isfile(full_src_file):
        full_dest_file = os.path.join(dest_folder, xml_basename + ".xml")

        if log_file:
            log_file.write(f"Copy {os.path.basename(full_src_file)} ...")

        resolver.copy_file(full_src_file, full_dest_file)

        if log_file:
            log_file.write("done\n")

    # b. tif, jpg files
    img_files = [
        os.path.join(src_folder, f)
        for f in os.listdir(src_folder)
        if os.path.isfile(os.path.join(src_folder, f)) and f.endswith((".tif", ".jpg"))
    ]

    for img_file in img_files:
        if log_file:
            log_file.write(f"Copy {os.path.basename(img_file)} ...")

        resolver.copy_file(img_file, dest_folder)

        if log_file:
            log_file.write("done\n")


def copy_numdam_src_files(colid, pid, article_pids, export_folder, log_file=None):
    """
    Archive the digitisation source files (XML, tif, jpg) of an issue and of its articles.

    Files are copied in the src/digitisation/ sub-folder of the issue / article
    folder, below export_folder.
    Nothing is done if the issue has no folder in settings.NUMDAM_ISSUE_SRC_FOLDER.
    Articles without a folder in settings.NUMDAM_ARTICLE_SRC_FOLDER are skipped.

    :param colid: pid of the collection
    :param pid: pid of the issue
    :param article_pids: pids of the articles to archive
    :param export_folder: root folder of the archive
    :param log_file: optional object with a write() method, used to report progress
    """
    src_folder = os.path.join(settings.NUMDAM_ISSUE_SRC_FOLDER, colid, pid)

    if not os.path.isdir(src_folder):
        return

    # 1. Files related to the issue
    dest_folder = os.path.join(
        export_folder, resolver.get_relative_folder(colid, pid), "src/digitisation/"
    )
    _copy_numdam_digitisation_files(src_folder, dest_folder, pid, log_file)

    # 2. Files related to articles
    for article_pid in article_pids:
        src_folder = os.path.join(settings.NUMDAM_ARTICLE_SRC_FOLDER, colid, pid, article_pid)

        # The article root is not the folder checked above: an article may have
        # no source folder, and listing it would raise FileNotFoundError
        if not os.path.isdir(src_folder):
            continue

        dest_folder = os.path.join(
            export_folder,
            resolver.get_relative_folder(colid, pid, article_pid),
            "src/digitisation/",
        )
        _copy_numdam_digitisation_files(src_folder, dest_folder, article_pid, log_file)

547 

548 # PDF/DJVU without headers 

549 # Olivier 09/05/2019: these files are almost identicals to the final PDFs 

550 # There is no need to archive them. Just use the final PDF and remove the first page 

551 # if needed 

552 

553 # # 2c. pdf files (without header) 

554 # full_src_file = os.path.join(src_folder, article_pid + '.pdf') 

555 # if os.path.isfile(full_src_file): 

556 # full_dest_file = os.path.join(dest_folder, article_pid + '.pdf') 

557 # if file: 

558 # file.write("Copy {} ...".format(os.path.basename(full_src_file))) 

559 # copy_file(full_src_file, full_dest_file) 

560 # if file: 

561 # file.write("done\n") 

562 # 

563 

564 

565# def copy_numdam_djvu(colid, pid, article_pids, export_folder, log_file=None): 

566# """ 

567# Djvu might not be visible/listed in centre Mersenne articles, but might exist in Numdam 

568# """ 

569# 

570# if hasattr(settings, "NUMDAM_DATA_ROOT"): 

571# for article_pid in article_pids: 

572# article_folder = resolver.get_relative_folder(colid, pid, article_pid) 

573# full_src_file = os.path.join( 

574# settings.NUMDAM_DATA_ROOT, article_folder, article_pid + ".djvu" 

575# ) 

576# if os.path.isfile(full_src_file): 

577# full_dest_file = os.path.join(export_folder, article_folder, article_pid + ".djvu") 

578# if log_file: 

579# log_file.write(f"Copy {os.path.basename(full_src_file)} ...") 

580# resolver.copy_file(full_src_file, full_dest_file) 

581# if log_file: 

582# log_file.write("done\n") 

583 

584 

class archiveNumdamResourcePtfCmd(archiveIssuePtfCmd):
    """
    Archive a Container or a Collection (just the collection level) stored in Numdam
    """

    def __init__(self, params=None):
        # Defaults set before the base constructor runs: it only assigns
        # export_folder / binary_files_folder when they are not defined yet
        self.colid = None  # self.pid from the base class is the id of the container
        self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.NUMDAM_DATA_ROOT

        super().__init__(params)

        self.required_params.extend(["colid"])
        # self.pid is optional when you want to archive a Collection
        self.required_params = [name for name in self.required_params if name != "pid"]

    def internal_do(self):
        """
        Archive files of Numdam.
        - Send http requests to numdam.org to get the list of user files (PDF/DjVu/XML) to preserve
        - Copy these files to self.export_folder (/mathdoc_archive)
        - Copy src files (mainly digitized TIF/JPG files) from /numdam_dev

        Warning: this class does not operate with Resource objects stored in the database (Collection, Container...)
                 since the data comes from numdam.org
                 Information is only based on pids (collection, issue, article)
        """

        is_issue = self.pid is not None

        # 1. Get the list of user files to archive (files visible by the user, like PDF or DjVus) from numdam.org
        data = get_numdam_file_list(self.colid, self.pid)
        pdfas = {}
        article_pids = []

        # 2. Prepare the backup of an Issue (backup, delete previous folder)
        if is_issue:
            article_pids = [article_data["pid"] for article_data in data.get("articles", [])]

            # Backup PDF/A before deleting the issue folder
            pdfas = self.backup_pdfa_if_necessary(self.colid, article_pids)

            issue_folder = resolver.get_relative_folder(self.colid, self.pid)

            log_path = os.path.join(settings.LOG_DIR, "archive.log")
            with open(log_path, "a", encoding="utf-8") as log:
                log.write(f"Delete {self.export_folder}/{issue_folder}\n")

            # Delete the issue folder
            resolver.delete_object_folder(issue_folder, self.export_folder)

        # 3. Archive the JATS XML of the pid
        # TODO: The XML coming from Numdam does not list PDF/A in the <self-uri> of the articles, only PDF/DjVu
        #       create_or_restore_pdfa is going to add the PDF/A on disk
        #       We should modify the xml to add the PDF/A
        archive_numdam_xml(self.colid, self.pid, self.export_folder)

        # 4. Archive the user files (list gotten in 1.)
        src_root = self.binary_files_folder
        dest_root = self.export_folder

        if "files" in data:
            # Files of a Collection or a Container
            resolver.copy_binary_files(None, src_root, dest_root, data["files"])

        if "articles" in data:
            # In case of a Container, files of each article
            for article_data in data["articles"]:
                resolver.copy_binary_files(None, src_root, dest_root, article_data["files"])

        if is_issue:
            # 5. Archive the src files (tiff, pdf/djvu without headers,...)
            copy_numdam_src_files(self.colid, self.pid, article_pids, self.export_folder)

            # 6. Create PDF/A
            self.create_or_restore_pdfa(self.colid, article_pids, pdfas)

664 

665 # 6. Create PDF/A 

666 self.create_or_restore_pdfa(self.colid, article_pids, pdfas)