Coverage for apps/ptf/cmds/ptf_cmds/archive_ptf

1import datetime

2import os

3import shutil

4import subprocess

6import lxml.etree as etree

7import requests

9from django.conf import settings

11from ptf import model_helpers

12from ptf.cmds.database_cmds import baseCmd

13from ptf.display import resolver

15from .base_ptf_cmds import exportExtraDataPtfCmd

16from .base_ptf_cmds import exportPtfCmd

def create_toc_xml(colid, issues):
    """
    Create the toc.xml file for the collection.

    The file is written to ``<settings.MATHDOC_ARCHIVE_FOLDER>/<colid>/toc.xml`` and
    replaces any previous toc.xml. It contains one ``<journal-meta>`` node per issue
    with the children ``journal-id``, ``year``, optional ``series``/``volume``/``number``
    and ``folder`` (the pid of the issue).

    Nothing is written if the collection folder is not readable.

    Unlike the previous implementation, the process working directory is left
    untouched: the file is addressed through its full path.

    :param colid: pid of the collection (ex: JEP)
    :param issues: iterable of objects exposing ``year``, ``vseries``, ``volume``,
                   ``number`` and ``pid`` (read as text values)
    """
    collection_folder = os.path.join(settings.MATHDOC_ARCHIVE_FOLDER, colid)
    if not os.access(collection_folder, os.R_OK):
        return

    journal = etree.Element("journal")

    for issue in issues:
        journal_meta = etree.SubElement(journal, "journal-meta")

        etree.SubElement(journal_meta, "journal-id").text = colid  # ex: JEP
        etree.SubElement(journal_meta, "year").text = issue.year  # ex: 2019

        # series, volume and number may be empty (or None): skip them in that case
        if issue.vseries:
            etree.SubElement(journal_meta, "series").text = issue.vseries
        if issue.volume:
            etree.SubElement(journal_meta, "volume").text = issue.volume  # ex: 1
        if issue.number:
            etree.SubElement(journal_meta, "number").text = issue.number

        etree.SubElement(journal_meta, "folder").text = issue.pid

    node_str = etree.tostring(journal, pretty_print=True, encoding="unicode")

    toc_path = os.path.join(collection_folder, "toc.xml")
    # Remove the previous file first so that a read-only leftover does not block the write
    if os.access(toc_path, os.R_OK):
        os.remove(toc_path)

    # No XML declaration is emitted by tostring(encoding="unicode"): write UTF-8 explicitly
    with open(toc_path, "w", encoding="utf-8") as toc_file:
        toc_file.write(node_str)

class archiveCollectionPtfCmd(baseCmd):
    """
    Archive the collection on disk

    Exports the collection (pid given by params["colid"]) to
    settings.MATHDOC_ARCHIVE_FOLDER, then writes the toc.xml of the collection
    from params["issues"].
    """

    def __init__(self, params=None):
        """
        :param params: dict with the keys "colid" (pid of the collection) and
                       "issues" (issues listed in toc.xml). A KeyError is raised
                       if a dict is given without one of these keys.
        """
        super().__init__(params)

        # Attributes are always defined, even when the command is created without params
        self.mathdoc_archive = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        if params is None:
            self.pid = None
            self.issues = []
        else:
            self.pid = params["colid"]
            self.issues = params["issues"]

        self.required_params.extend(["colid"])

    def internal_do(self):
        """
        Export the collection to the archive folder, then (re)create its toc.xml.
        """
        exportPtfCmd(
            {
                "pid": self.pid,  # Export the collection
                "export_folder": self.mathdoc_archive,  # Export in the archive folder
                "for_archive": True,  # Export for the archive
                "with_internal_data": False,  # Internal data are not exported
                "with_binary_files": True,  # Binary files (ex: img) are exported
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        create_toc_xml(self.pid, self.issues)

class archiveIssuePtfCmd(baseCmd):
    """
    Archive the issue on disk

    Depending on the options, the command archives
    - the XML of the issue (always)
    - the binary files of the articles (PDF...)
    - the LaTeX sources needed to re-compile the articles
    - the digitized sources (Numdam)
    - a PDF/A version of each article PDF

    Progress is appended to <settings.LOG_DIR>/archive.log.
    """

    def __init__(self, params=None):
        """
        :param params: optional dict. "issue" is read here; the other keys are handled
                       by the base class (presumably set as attributes - see baseCmd).
        """
        self.pid = None  # container pid
        self.article = None  # Allow archiving of only 1 article
        self.skip_pdfa = False
        self.xml_only = False  # Geodesic needs to archive only the XML
        # params defaults to None: guard the lookup instead of calling params.keys()
        self.issue = params.get("issue") if params else None

        # A derived class (see archiveNumdamResourcePtfCmd) may set other default values
        # before calling this constructor
        if not hasattr(self, "export_folder"):
            self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        if not hasattr(self, "binary_files_folder"):
            self.binary_files_folder = settings.MERSENNE_PROD_DATA_FOLDER

        super().__init__(params)

        self.required_params.extend(["pid"])

    @staticmethod
    def _append_to_log(message, log_name="archive.log"):
        """Append message to the given log file of settings.LOG_DIR."""
        with open(os.path.join(settings.LOG_DIR, log_name), "a", encoding="utf-8") as file_:
            file_.write(message)

    @staticmethod
    def _modification_day(path):
        """Return the last modification date of path as a YYYY-MM-DD string."""
        return datetime.datetime.fromtimestamp(os.stat(path).st_mtime).strftime("%Y-%m-%d")

    @staticmethod
    def _copy_bib_file(src_folder, dest_folder, base_file):
        """Copy <base_file>.bib from src_folder to dest_folder if the file exists."""
        full_src_file = os.path.join(src_folder, base_file + ".bib")
        if os.path.isfile(full_src_file):
            full_dest_file = os.path.join(dest_folder, base_file + ".bib")
            resolver.copy_file(full_src_file, full_dest_file)

    # Pass an article to archive only 1 article
    def set_article(self, article):
        self.article = article

    def archive_tex_src(self, article, colid, tex_src_folder, tex_article_folder):
        """
        Archive the files needed to re-compile the LaTeX source of an article.

        Files are copied from <tex_src_folder>/<tex_article_folder> to
        <export_folder>/<article relative folder>/src/tex/:
        the .tex file, the .sty files, the .bib files and the local files listed
        in the *.fls files (figures...).

        :param article: article to archive (get_relative_folder() is used)
        :param colid: pid of the collection (unused, kept for the callers)
        :param tex_src_folder: folder with the LaTeX sources of the issue
        :param tex_article_folder: name of the article folder (and of its .tex file)
        :raises RuntimeError: if a grep command fails (exit code other than 0 or 1)
        """
        src_folder = os.path.join(tex_src_folder, tex_article_folder)
        dest_folder = os.path.join(self.export_folder, article.get_relative_folder(), "src/tex/")

        resolver.create_folder(dest_folder)
        self._append_to_log(f"Create {dest_folder}\n")

        # 1. Copy tex file
        tex_file = os.path.join(src_folder, tex_article_folder + ".tex")
        resolver.copy_file(tex_file, dest_folder)

        # 2. sty files
        for name in os.listdir(src_folder):
            sty_file = os.path.join(src_folder, name)
            if name.endswith(".sty") and os.path.isfile(sty_file):
                resolver.copy_file(sty_file, dest_folder)

        # 3. bib file
        aux_file = os.path.join(src_folder, tex_article_folder + ".aux")
        if os.path.isfile(aux_file):
            # Argument list (no shell): paths with spaces or shell metacharacters are safe
            grep_cmd = ["grep", "bibdata", aux_file]
            try:
                result = subprocess.check_output(grep_cmd).decode(encoding="utf-8")
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:  # grep returns 1 if nothing was found
                    message = f'Error {e.returncode} with "{" ".join(grep_cmd)}": {e.output}'
                    raise RuntimeError(message) from e
                result = ""

            for line in result.split("\n"):
                # Expected line: \bibdata{name} or \bibdata{name1,name2}
                words = line.split("{")
                if len(words) < 2:
                    continue
                words = words[1].split("}")
                if len(words) < 2:
                    continue
                for base_file in words[0].split(","):
                    base_file = base_file.strip()
                    if base_file:
                        self._copy_bib_file(src_folder, dest_folder, base_file)
        else:
            base_file = resolver.get_bibtex_from_tex(tex_file)
            if base_file:
                self._copy_bib_file(src_folder, dest_folder, base_file)

        # 4. cdrdoidates, figures/ folder,...
        # The pipeline is a constant string; the folder is passed with cwd= instead of
        # being concatenated in a "cd ...;" prefix
        cmd_str = r"grep -v /usr/local/texlive/ *.fls | grep '\./' | grep -v '\.out$' | sort -u"
        try:
            result = subprocess.check_output(cmd_str, shell=True, cwd=src_folder).decode(
                encoding="utf-8"
            )
        except subprocess.CalledProcessError as e:
            if e.returncode != 1:  # grep returns 1 if nothing was found
                message = f'Error {e.returncode} with "{cmd_str}": {e.output}'
                raise RuntimeError(message) from e
            result = ""

        for line in result.split("\n"):
            # Expected line: "INPUT ./figures/fig1.pdf" (prefixed by the name of the
            # .fls file if there are many): the path is the second word
            words = line.split(" ")
            if len(words) < 2:
                continue

            relative_file = words[1]
            full_src_file = os.path.join(src_folder, relative_file)
            if os.path.isfile(full_src_file):
                resolver.create_folder(os.path.join(dest_folder, os.path.dirname(relative_file)))
                resolver.copy_file(full_src_file, os.path.join(dest_folder, relative_file))

    def create_pdfa(self, colid, article_pid):
        """
        Create the pdfa file of the pdf associated with an article

        Ghostscript (gs) converts
        <export_folder>/<colid>/<pid>/<article_pid>/<article_pid>.pdf into
        <article_pid>_PDFA.pdf in the same folder.

        :raises RuntimeError: if the conversion fails (the article pid is also
                              appended to archive_error.log)
        """
        article_folder = os.path.join(self.export_folder, colid, self.pid, article_pid)
        in_file = os.path.join(article_folder, article_pid + ".pdf")
        out_file = os.path.join(article_folder, article_pid + "_PDFA.pdf")

        # Argument list (no shell): paths with spaces or shell metacharacters are safe
        gs_cmd = [
            "gs",
            "-dSAFER",
            "-dBATCH",
            "-DNOPAUSE",
            "-sPAPERSIZE=halfletter",
            "-dPDFFitPage",
            "-dFIXEDMEDIA",
            "-dEmbedAllFonts=true",
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPrinted=false",
            "-dPDFSETTINGS=/printer",
            "-q",
            "-o",
            out_file,
            in_file,
        ]

        try:
            conversion_failed = subprocess.run(gs_cmd, check=False).returncode != 0
        except OSError:
            # gs could not be started: handled like a failed conversion
            conversion_failed = True

        if conversion_failed:
            self._append_to_log(article_pid + " , PDF/A\n", log_name="archive_error.log")
            raise RuntimeError(f"Le fichier {in_file} n'a pas pu être converti en PDFA")

        self._append_to_log(f"Write {out_file}\n")

    def backup_pdfa_if_necessary(self, colid, article_pids):
        """
        Since it is time consuming to create a PDF/A, check if they have to be re-created
        before deleting the /mathdoc_archive issue folder.

        If not, backup the PDF/A in a temp folder. They will be put back in /mathdoc_archive
        in create_or_restore_pdfa()

        To check, we compare the date (day) of the article PDF between
        - /mathdoc_archive and
        - self.binary_files_folder (/mersenne_prod_data or /numdam_data)

        :return: dict {article_pid: True if the PDF/A has to be (re)created}
        """

        pdfas = {}
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)
        resolver.create_folder(tmp_folder)
        self._append_to_log(f"Create {tmp_folder}\n")

        # Before deleting the issue folder, check if we have to recreate the PDF/A
        # If the PDF has been updated in the binary files folder, we recreate the PDF/A
        for article_pid in article_pids:
            pdf_in_archive = resolver.get_disk_location(
                self.export_folder, colid, "pdf", self.pid, article_pid
            )
            pdf_in_prod = resolver.get_disk_location(
                self.binary_files_folder, colid, "pdf", self.pid, article_pid
            )
            pdfa_in_archive = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )

            do_create_pdfa = True

            if os.path.isfile(pdf_in_prod) and os.path.isfile(pdf_in_archive):
                date_pdf_in_prod = self._modification_day(pdf_in_prod)
                date_pdf_in_archive = self._modification_day(pdf_in_archive)

                do_create_pdfa = (
                    not os.path.isfile(pdfa_in_archive) or date_pdf_in_prod != date_pdf_in_archive
                )

            pdfas[article_pid] = do_create_pdfa
            if not do_create_pdfa:
                # Copy the PDF/A in the temp folder
                dest_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
                resolver.copy_file(pdfa_in_archive, dest_pdfa)
                self._append_to_log(f"Backup {dest_pdfa}\n")

        return pdfas

    def create_or_restore_pdfa(self, colid, article_pids, pdfas):
        """
        Create the PDF/A of each article, or restore the one saved by
        backup_pdfa_if_necessary(), then delete the temp backup folder.

        :param pdfas: dict {article_pid: True if the PDF/A has to be (re)created}
        """
        tmp_folder = os.path.join(settings.LOG_DIR, "tmp/archive", colid, self.pid)

        for article_pid in article_pids:
            if pdfas[article_pid]:
                self.create_pdfa(colid, article_pid)
                continue

            src_pdfa = os.path.join(tmp_folder, article_pid + "_PDFA.pdf")
            dest_pdfa = os.path.join(
                self.export_folder, colid, self.pid, article_pid, article_pid + "_PDFA.pdf"
            )
            resolver.copy_file(src_pdfa, dest_pdfa)
            self._append_to_log(f"Restore {dest_pdfa}\n")

        if os.path.isdir(tmp_folder):
            shutil.rmtree(tmp_folder)
            self._append_to_log(f"Delete {tmp_folder}\n")

    def archive_files(self, colid, container, articles, article_pids, pdfas):
        """
        Archive everything but the XML of the issue: binary files, extra data,
        LaTeX sources, digitized sources and PDF/A.

        :param articles: articles to archive
        :param article_pids: pids of the articles to archive
        :param pdfas: result of backup_pdfa_if_necessary() (unused if self.skip_pdfa)
        """
        # II. Copy binary files (PDF...)
        for a in articles:
            article_folder = a.get_relative_folder()
            self._append_to_log(f"Delete {article_folder}\n")

            resolver.delete_object_folder(article_folder, to_folder=self.export_folder)
            resolver.copy_binary_files(a, self.binary_files_folder, self.export_folder)

        params = {"pid": self.pid, "export_folder": self.export_folder, "export_all": False}
        exportExtraDataPtfCmd(params).do()

        tex_src_folder = resolver.get_cedram_issue_tex_folder(colid, self.pid)
        tex_folders, _ = resolver.get_cedram_tex_folders(colid, self.pid)

        # III. Articles written in LaTeX. We need to archive files needed to re-compile
        # the LaTeX source code. tex_folders is indexed by the position of the article
        # among the published articles of the container.
        if len(tex_folders) > 0:
            published_articles = container.article_set.exclude(do_not_publish=True)
            for i, article in enumerate(published_articles):
                if self.article is None or self.article.pid == article.pid:
                    self.archive_tex_src(article, colid, tex_src_folder, tex_folders[i])

        # IV. Digitized papers (Numdam). We basically need to archive the images (*.tif)
        copy_numdam_src_files(colid, self.pid, article_pids, self.export_folder)

        # V. Create PDF/A
        if not self.skip_pdfa:
            self.create_or_restore_pdfa(colid, article_pids, pdfas)

    def internal_do(self):
        """
        Archive the issue, or only self.article if an article was set with set_article().
        """
        super().internal_do()

        if self.article is None:
            container = model_helpers.get_container(self.pid, prefetch=False)
            qs = container.article_set.all()
            article_pids = list(qs.values_list("pid", flat=True))
            articles = qs
        else:
            container = self.article.my_container
            article_pids = [self.article.pid]
            articles = [self.article]

        colid = container.get_top_collection().pid
        self.pid = container.pid

        pdfas = {}
        if not self.xml_only and not self.skip_pdfa:
            # Backup PDF/A before deleting the issue folder
            # (it is time consuming to create a PDF/A, we will check if they have to be re-created)
            pdfas = self.backup_pdfa_if_necessary(colid, article_pids)

        # Delete the issue folder if we archive an issue
        if self.article is None:
            issue_folder = container.get_relative_folder()
            self._append_to_log(f"Delete {self.export_folder}/{issue_folder}\n")
            resolver.delete_object_folder(issue_folder, to_folder=self.export_folder)

        # I. Always archive the issue XML, even if we archive only 1 article
        exportPtfCmd(
            {
                "pid": self.pid,
                "export_folder": self.export_folder,
                # binary files for 1 article are copied in archive_files()
                "with_binary_files": self.article is None,
                "for_archive": True,
                "binary_files_folder": self.binary_files_folder,
            }
        ).do()

        if not self.xml_only:
            self.archive_files(colid, container, articles, article_pids, pdfas)

405

406

def archive_numdam_xml(colid, pid, export_folder, timeout=120):
    """
    Get the XML of a collection or an issue from numdam.org and write it in the archive

    :param colid: pid of the collection
    :param pid: pid of the issue, or None to get the XML of the collection
    :param export_folder: root folder of the archive
    :param timeout: timeout (in seconds) of the HTTP request, so that an unresponsive
                    server does not block the archive forever
    :raises requests.HTTPError: if the server answers with an error status
    """
    item_pid = colid if pid is None else pid
    url = settings.NUMDAM_URL + "/api-item-xml/" + item_pid

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    xml_body = response.content.decode("utf-8")

    # Nothing is written if the body is empty
    if xml_body:
        file = resolver.get_archive_filename(export_folder, colid, pid, "xml", True)

        with open(file, "w", encoding="utf-8") as f:
            f.write(xml_body)

426

427

def get_numdam_issues_list(colid, timeout=120):
    """
    Get the list of issues of a collection from numdam.org

    :param colid: pid of the collection
    :param timeout: timeout (in seconds) of the HTTP request, so that an unresponsive
                    server does not block the caller forever
    :return: the value of the "issues" key of the JSON response
    :raises requests.HTTPError: if the server answers with an error status
    """
    response = requests.get(f"{settings.NUMDAM_URL}/api-issues/{colid}", timeout=timeout)
    response.raise_for_status()

    return response.json()["issues"]

436

437

def get_numdam_file_list(colid, pid, timeout=120):
    """
    Get the list of files to archive (of a collection or an issue)
    The files are those visible by the user, like PDF or DjVus

    :param colid: pid of the collection
    :param pid: pid of the issue, or None to get the files of the collection
    :param timeout: timeout (in seconds) of the HTTP request, so that an unresponsive
                    server does not block the caller forever
    :return: the decoded JSON response
    :raises requests.HTTPError: if the server answers with an error status
    """
    item_pid = colid if pid is None else pid
    url = settings.NUMDAM_URL + "/api-item-file-list/" + item_pid

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response.json()

453

454

def copy_numdam_src_files(colid, pid, article_pids, export_folder, log_file=None):
    """
    Archive the digitized sources of a Numdam issue and of its articles.

    The XML and the images (.tif, .jpg) are copied
    - from <settings.NUMDAM_ISSUE_SRC_FOLDER>/<colid>/<pid> for the issue
    - from <settings.NUMDAM_ARTICLE_SRC_FOLDER>/<colid>/<pid>/<article_pid> for each article
    to the "src/digitisation/" sub-folder of the issue/article in export_folder.

    Nothing is done if the source folder of the issue does not exist.
    An article without source folder is skipped.

    :param colid: pid of the collection
    :param pid: pid of the issue
    :param article_pids: pids of the articles of the issue
    :param export_folder: root folder of the archive
    :param log_file: optional file-like object (write() is called to log the progress)
    """
    src_folder = os.path.join(settings.NUMDAM_ISSUE_SRC_FOLDER, colid, pid)

    if not os.path.isdir(src_folder):
        return

    # 1. Files related to the issue
    dest_folder = os.path.join(
        export_folder, resolver.get_relative_folder(colid, pid), "src/digitisation/"
    )
    _copy_numdam_digitisation_files(src_folder, dest_folder, pid + ".xml", log_file)

    # 2. Files related to articles
    for article_pid in article_pids:
        src_folder = os.path.join(settings.NUMDAM_ARTICLE_SRC_FOLDER, colid, pid, article_pid)

        # The existence of the issue folder does not guarantee that each article has
        # its own folder: listing a missing folder would raise a FileNotFoundError
        if not os.path.isdir(src_folder):
            continue

        dest_folder = os.path.join(
            export_folder,
            resolver.get_relative_folder(colid, pid, article_pid),
            "src/digitisation/",
        )
        # The XML of an article is its Full Text
        _copy_numdam_digitisation_files(src_folder, dest_folder, article_pid + ".xml", log_file)

    # PDF/DJVU without headers
    # Olivier 09/05/2019: these files are almost identical to the final PDFs
    # There is no need to archive them. Just use the final PDF and remove the first page
    # if needed


def _copy_numdam_digitisation_files(src_folder, dest_folder, xml_name, log_file=None):
    """
    Create dest_folder, then copy xml_name (if it exists) and the .tif/.jpg files
    of src_folder into it. Each step is logged in log_file if one is given.
    """

    def log(message):
        if log_file:
            log_file.write(message)

    log("Create " + dest_folder + "...")
    resolver.create_folder(dest_folder)
    log("done\n")

    # a. XML file
    full_src_file = os.path.join(src_folder, xml_name)
    if os.path.isfile(full_src_file):
        full_dest_file = os.path.join(dest_folder, xml_name)

        log(f"Copy {os.path.basename(full_src_file)} ...")
        resolver.copy_file(full_src_file, full_dest_file)
        log("done\n")

    # b. tif, jpg files
    img_files = [
        os.path.join(src_folder, f)
        for f in os.listdir(src_folder)
        if f.endswith((".tif", ".jpg")) and os.path.isfile(os.path.join(src_folder, f))
    ]

    for img_file in img_files:
        log(f"Copy {os.path.basename(img_file)} ...")
        resolver.copy_file(img_file, dest_folder)
        log("done\n")

571

572

573# def copy_numdam_djvu(colid, pid, article_pids, export_folder, log_file=None):

574# """

575# Djvu might not be visible/listed in centre Mersenne articles, but might exist in Numdam

576# """

577#

578# if hasattr(settings, "NUMDAM_DATA_ROOT"):

579# for article_pid in article_pids:

580# article_folder = resolver.get_relative_folder(colid, pid, article_pid)

581# full_src_file = os.path.join(

582# settings.NUMDAM_DATA_ROOT, article_folder, article_pid + ".djvu"

583# )

584# if os.path.isfile(full_src_file):

585# full_dest_file = os.path.join(export_folder, article_folder, article_pid + ".djvu")

586# if log_file:

587# log_file.write(f"Copy {os.path.basename(full_src_file)} ...")

588# resolver.copy_file(full_src_file, full_dest_file)

589# if log_file:

590# log_file.write("done\n")

591

592

class archiveNumdamResourcePtfCmd(archiveIssuePtfCmd):
    """
    Archive a Container or a Collection (just the collection level) stored in Numdam
    """

    def __init__(self, params=None):
        # Defaults specific to Numdam: they have to be set before calling the base
        # constructor, which only sets the folders that are not defined yet
        self.colid = None  # self.pid from the base class is the id of the container
        self.export_folder = settings.MATHDOC_ARCHIVE_FOLDER
        self.binary_files_folder = settings.NUMDAM_DATA_ROOT

        super().__init__(params)

        self.required_params.extend(["colid"])
        # self.pid is optional when you want to archive a Collection
        self.required_params = [name for name in self.required_params if name != "pid"]

    def internal_do(self):
        """
        Archive files of Numdam.
        - Send http requests to numdam.org to get the list of user files (PDF/DjVu/XML) to preserve
        - Copy these files to self.export_folder (/mathdoc_archive)
        - Copy src files (mainly digitized TIF/JPG files) from /numdam_dev

        Warning: this class does not operate with Resource objects stored in the database
        (Collection, Container...) since the data comes from numdam.org
        Information is only based on pids (collection, issue, article)
        """

        archive_issue = self.pid is not None  # False: only the collection level is archived

        # 1. Get the list of user files to archive (files visible by the user, like PDF or DjVus)
        #    from numdam.org
        data = get_numdam_file_list(self.colid, self.pid)
        has_articles = "articles" in data
        pdfas = {}
        article_pids = []

        # 2. Prepare the backup of an Issue (backup, delete previous folder)
        if archive_issue:
            if has_articles:
                article_pids = [item["pid"] for item in data["articles"]]

            # Backup PDF/A before deleting the issue folder
            # (it is time consuming to create a PDF/A, we will check if they have to be re-created)
            pdfas = self.backup_pdfa_if_necessary(self.colid, article_pids)

            issue_folder = resolver.get_relative_folder(self.colid, self.pid)

            log_path = os.path.join(settings.LOG_DIR, "archive.log")
            with open(log_path, "a", encoding="utf-8") as file_:
                file_.write(f"Delete {self.export_folder}/{issue_folder}\n")

            # Delete the issue folder
            resolver.delete_object_folder(issue_folder, self.export_folder)

        # 3. Archive the JATS XML of the pid
        # TODO: The XML coming from Numdam does not list PDF/A in the <self-uri> of the articles,
        #       only PDF/DjVu
        #       create_or_restore_pdfa is going to add the PDF/A on disk
        #       We should modify the xml to add the PDF/A
        archive_numdam_xml(self.colid, self.pid, self.export_folder)

        # 4. Archive the user files (list gotten in 1.)
        if "files" in data:
            # Files of a Collection or a Container
            resolver.copy_binary_files(
                None, self.binary_files_folder, self.export_folder, data["files"]
            )

        if has_articles:
            # In case of a Container, files of each article
            for article_data in data["articles"]:
                resolver.copy_binary_files(
                    None, self.binary_files_folder, self.export_folder, article_data["files"]
                )

        if archive_issue:
            # 5. Archive the src files (tiff, pdf/djvu without headers,...)
            copy_numdam_src_files(self.colid, self.pid, article_pids, self.export_folder)

            # 6. Create PDF/A
            self.create_or_restore_pdfa(self.colid, article_pids, pdfas)

Coverage for apps/ptf/cmds/ptf_cmds/archive_ptf_cmds.py: 30%

322 statements