Coverage for apps/ptf/utils.py: 50%

178 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-05-19 19:20 +0000

1import difflib 

2import html 

3import os 

4import re 

5import subprocess 

6import unicodedata 

7 

8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES 

9from bleach.css_sanitizer import CSSSanitizer 

10from bleach.sanitizer import Cleaner 

11from PIL import Image 

12from PIL import ImageFile 

13 

14ImageFile.LOAD_TRUNCATED_IMAGES = True 

15 

16from django.conf import settings 

17from django.core.exceptions import ImproperlyConfigured 

18from django.core.mail import EmailMultiAlternatives 

19from django.template import Template 

20from django.template import TemplateSyntaxError 

21from django.template import engines 

22from django.template.loader import render_to_string 

23from django.utils.html import strip_tags 

24from django.utils.translation import gettext_lazy as _ 

25 

26from ptf.site_register import SITE_REGISTER 

27 

28 

# Compiled once at import time instead of on every call.
# IGNORECASE lets upper-case entities such as "&Eacute;" or "&#xE9;" match;
# DOTALL lets "<.*?>" match a tag whose attributes span several lines.
_MARKUP_RE = re.compile(
    r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
    re.IGNORECASE | re.DOTALL,
)


def strip_markup(string):
    """
    Remove markup from a string:
    - XML-like tags (MathML, HTML, etc.), i.e. anything between "<" and the next ">"
    - HTML entities, named or numeric (&nbsp;, &#160;, &#xA0;, etc.)

    Matching is case-insensitive and tags may span several lines.

    :param string: the text to clean
    :return: the text with every tag and entity removed
    """
    return _MARKUP_RE.sub("", string)

37 

38 

def highlight_diff(ours, theirs):
    """
    Compare two strings and return `theirs` decorated with diff highlighting.

    Both inputs are lower-cased and passed through `strip_markup` before the
    comparison. The returned text is built from the cleaned `theirs` only:
    - parts equal to `ours` are wrapped in a `bg-success` span,
    - parts that replace something in `ours` are wrapped in a `bg-dark` span,
    - every other part is emitted as is.
    """
    reference = strip_markup(ours.lower())
    candidate = strip_markup(theirs.lower())
    matcher = difflib.SequenceMatcher(None, reference, candidate)

    pieces = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        fragment = candidate[j1:j2]
        if tag == "equal":
            pieces.append(f"<span class='bg-success'>{fragment}</span>")
        elif tag == "replace":
            pieces.append(f"<span class='bg-dark'>{fragment}</span>")
        else:
            pieces.append(fragment)

    return "".join(pieces)

53 

54 

def volume_display():
    """
    Return the label used for a volume.

    "Volume" when `settings.VOLUME_STRING` is truthy, otherwise the
    translated string "Tome".
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")

60 

61 

def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command, or only log it when execution is disabled.

    The command really runs when `settings.MERSENNE_CREATE_FRONTPAGE` is truthy
    or when `force_execute` is True. Otherwise it is appended to
    `<settings.LOG_DIR>/cmd.log` and nothing is executed.

    NOTE(review): the two paths return different shapes - the raw output of
    `subprocess.check_output` when executed, a `(0, cmd)` tuple when only logged.

    @param cmd: str which represents the shell command (run with shell=True)
    @param force_execute: run the command even if the setting disables execution
    @return: output of the command, or `(0, cmd)` when the command was only logged
    """
    if not (settings.MERSENNE_CREATE_FRONTPAGE or force_execute):
        # Debug path: keep a trace of the command instead of running it.
        log_path = os.path.join(settings.LOG_DIR, "cmd.log")
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"cmd : {cmd}\n")
        return 0, cmd

    return subprocess.check_output(cmd, shell=True)

79 

80 

def get_file_content_in_utf8(filename):
    """
    Read a whole text file decoded as UTF-8.

    :param filename: path of the file to read
    :return: the body of the file as a string
    """
    with open(filename, encoding="utf-8") as stream:
        return stream.read()

90 

91 

def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the `pdftotext` command line tool.

    Returns an empty string without doing anything when
    `settings.MERSENNE_CREATE_FRONTPAGE` is falsy. Otherwise the text is written
    to `<settings.MERSENNE_TMP_FOLDER>/fulltext.txt`, read back as UTF-8 and
    stripped of control characters.

    :param pdf_filename: path of the PDF to convert
    :return: the extracted text, or "" when extraction is disabled
    :raises RuntimeError: if pdftotext did not produce the output file
    """
    # Local import: shlex is not part of this module's import block.
    import shlex

    if not settings.MERSENNE_CREATE_FRONTPAGE:
        return ""

    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # The output name is fixed: drop any file left by a previous run, otherwise
    # the existence check below could succeed on stale content.
    try:
        os.remove(txt_filename)
    except FileNotFoundError:
        pass

    # The command goes through a shell (see execute_cmd), so both paths are
    # quoted to survive spaces and shell metacharacters.
    cmd_str = "pdftotext -raw -nopgbrk -enc UTF-8 {} {}".format(
        shlex.quote(os.fspath(pdf_filename)), shlex.quote(txt_filename)
    )
    execute_cmd(cmd_str)

    # Check if the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # Strip control characters (every Unicode category starting with "C").
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")

112 

113 

def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with the `qpdf` command line tool.

    A failure of the command is tolerated as long as the output file exists
    afterwards; otherwise the original exception is re-raised.

    :param from_path: path of the source PDF
    :param to_path: path of the linearized PDF to write
    :return: always False (the original code names this value `do_copy`)
    :raises Exception: whatever the command raised, when `to_path` was not created
    """
    # Local import: shlex is not part of this module's import block.
    import shlex

    # The command runs through a shell, so both paths are quoted to survive
    # spaces and shell metacharacters.
    cmd_str = f"qpdf --linearize {shlex.quote(from_path)} {shlex.quote(to_path)}"

    try:
        subprocess.check_output(cmd_str, shell=True)
    except Exception:
        # Only propagate the failure when no output was produced at all.
        if not os.path.isfile(to_path):
            raise

    return False

127 

128 

def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    When neither `first_name` nor `last_name` holds any non-blank text,
    `string_name` is returned untouched. Otherwise:
    - if `settings.DISPLAY_FIRST_NAME_FIRST` is truthy: "<first_name> <last_name>"
      (prefix and suffix are not used in that form);
    - else: "<prefix> <last_name>, <first_name> <suffix>", prefix and suffix
      being added only when they are non-empty.

    :param prefix: optional text placed before the name
    :param first_name: first name; may be empty, blank or None
    :param last_name: last name; may be empty, blank or None
    :param suffix: optional text placed after the name
    :param string_name: fallback used when no usable first/last name exists
    :return: the display name
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    # Keep only names with real content. The same list drives the branches
    # below, so blank or None names can no longer wipe out string_name.
    name_parts = [part for part in (last_name, first_name) if part and part.strip()]
    if not name_parts:
        return string_name

    if display_first_name_first:
        return " ".join(reversed(name_parts))

    display_name = f"{prefix} " if prefix else ""
    display_name += ", ".join(name_parts)
    display_name += f" {suffix}" if suffix else ""
    return display_name

141 

142 

def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitizes HTML input from the CKEditor.
    It uses bleach library (https://bleach.readthedocs.io/en/latest/index.html), an allowed-list-based sanitizer.
    JavaScript is removed by allowing only a subset of HTML tags and attributes.
    It does not make use of `lxml.html.Cleaner` because the documentation clearly says that this is not a secure
    approach.

    Tags that are not allowed are stripped (their markup is removed), and inline
    styles are filtered through a CSS property allow-list.

    html: str
        The HTML string to sanitize. Note: inside this function the parameter
        shadows the `html` standard-library module imported by this file.
    allow_img : bool
        Whether to preserve img related tags

    Returns the sanitized HTML string.
    """
    # NOTE(review): "sub" is allowed but "sup" is not - confirm whether
    # superscript was left out on purpose.
    allowed_tags = [
        "a",
        "abbr",
        "acronym",
        "address",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "cite",
        "code",
        "dd",
        "del",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "hgroup",
        "hr",
        "i",
        "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p",
        "pre",
        "q",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "tr",
        "u",
        "ul",
        "var",
    ]

    # Only kept by the cleaner when allow_img is True.
    image_allowed_tags = ["figcaption", "figure", "img", "picture", "source"]

    allowed_attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    additional_css_properties = [
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    ]

    allowed_css_properties = ALLOWED_CSS_PROPERTIES | set(additional_css_properties)

    # Build the effective tag list without mutating the base list.
    tags = allowed_tags + image_allowed_tags if allow_img else allowed_tags

    css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
    cleaner = Cleaner(
        tags=tags, attributes=allowed_attributes, css_sanitizer=css_sanitizer, strip=True
    )
    return cleaner.clean(html)

249 

250 

def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str],
    from_email: str | None = None,
    cc: list[str] | tuple[str] = (),
    from_collection: str = "",
    reply_to: list[str] | tuple[str] = (),
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.

    Sender resolution when `from_email` is empty or None: the `email_from`
    entry of `from_collection` in site_register.py is used. If no collection is
    given, `from_email` is passed on empty and the mail backend picks the
    sender (per the Django documentation, `settings.DEFAULT_FROM_EMAIL`).

    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is
                            empty or None it will get the email from
                            site_register.py (`email_from`).
        - reply_to          The list or tuple of the Reply-To addresses

    Raises:
        ImproperlyConfigured if `from_collection` is given but has no
        `email_from` entry in site_register.py.
    """
    # `not from_email` covers both "" and the default None, as documented.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # A missing entry is only an error when a collection was requested.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    # We additionally unescape HTML characters here to avoid having stuff like
    # &nbsp; &gt; etc. in the output text.
    text_content = html.unescape(strip_tags(html_content))
    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc,
        headers={"Return-path": return_path},
        reply_to=reply_to,
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)

299 

300 

def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str],
    from_email: str = "",
    cc: list[str] | tuple[str] = (),
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends it as an e-mail to the
    provided recipients and copy recipients.
    It sends the e-mail with both a text and a HTML alternative.

    Sender resolution when `from_email` is empty or None: the `email_from`
    entry of `from_collection` in site_register.py is used. If no collection is
    given, `from_email` is handed to `send_email` still empty.

    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is
                            empty or None it will get the email from
                            site_register.py (`email_from`).

    Raises:
        ImproperlyConfigured if `from_collection` is given but has no
        `email_from` entry in site_register.py.
    """
    # `not from_email` covers both "" and None, as documented.
    # Resolved before rendering so a configuration error surfaces first.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # A missing entry is only an error when a collection was requested.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    html_content = render_to_string(template, context_data)
    send_email(
        html_content, subject, to=to, from_email=from_email, cc=cc, from_collection=from_collection
    )

339 

340 

def template_from_string(template_string, using=None) -> Template:
    """
    Build a template object from a string.

    The engine named by `using` is tried when given; otherwise every engine
    returned by `engines.all()` is tried in turn. The first engine that
    compiles the string without a `TemplateSyntaxError` wins. If none does,
    a `TemplateSyntaxError` carrying the template string is raised.
    """
    # This function is based on django.template.loader.get_template,
    # but uses Engine.from_string instead of Engine.get_template.
    if using is None:
        candidates = engines.all()
    else:
        candidates = [engines[using]]

    for backend in candidates:
        try:
            compiled = backend.from_string(template_string)
        except TemplateSyntaxError:
            # This engine rejected the string: move on to the next one.
            continue
        return compiled

    raise TemplateSyntaxError(template_string)

355 

356 

def resize_image(img, max_size=1600):
    """
    Shrink an image to at most `max_size` pixels wide, keeping its aspect ratio.

    An image whose width is already within the limit is returned unchanged
    (the very same object). Otherwise the result of `img.resize(...)` is
    returned.

    :param img: object exposing `width`, `height` and `resize((w, h))`
    :param max_size: maximum width, in pixels
    :return: the original image or its resized version
    """
    if img.width <= max_size:
        return img

    # Integer arithmetic avoids float rounding, and the lower bound of 1 keeps
    # very wide images from ending up with a height of 0.
    new_height = max(1, int(max_size * img.height // img.width))
    return img.resize((max_size, new_height))

367 

368 

def convert_tiff_to_jpg(img):
    """
    Save an opened image as a '.jpg' file next to its source file.

    The target is `<directory of img.filename>/<base name without extension>.jpg`.
    Images in a mode with an alpha channel or a palette (RGBA, LA, P, PA) are
    converted to RGB first, since the JPEG save would otherwise fail.

    :param img: an opened image exposing `filename`, `mode`, `size`,
                `convert`, `thumbnail` and `save`
    :return: None
    """
    image_directory, image_file = os.path.split(img.filename)
    image_name = os.path.splitext(image_file)[0]

    if img.mode in ("RGBA", "LA", "P", "PA"):
        img = img.convert("RGB")

    # Kept from the original code: asks for a thumbnail of the image's own size.
    # NOTE(review): this looks like a no-op - confirm before removing.
    img.thumbnail(img.size)
    img.save(os.path.join(image_directory, image_name + ".jpg"), "JPEG", quality=100)

378 

379 

# Attach the two helpers defined above to PIL's Image class so they can be
# called as methods on any opened image: img.resize_image(), img.convert_tiff_to_jpg().
setattr(Image.Image, "resize_image", resize_image)
setattr(Image.Image, "convert_tiff_to_jpg", convert_tiff_to_jpg)

382 

383 

def convert_tiff_to_jpg_from_path(image_path):
    """
    Take the path of a '.tiff' image and write a '.jpg' version next to it.

    The target is `<directory of image_path>/<base name without extension>.jpg`.
    The image is written once, after converting alpha/palette modes
    (RGBA, LA, P, PA) to RGB, because saving those modes as JPEG fails.

    :param image_path: path of the image to convert
    :return: None
    """
    directory, image_file = os.path.split(image_path)
    final_path = os.path.join(directory, os.path.splitext(image_file)[0] + ".jpg")

    # The context manager closes the source file even if the save fails.
    with Image.open(image_path) as img:
        if img.mode in ("RGBA", "LA", "P", "PA"):
            img = img.convert("RGB")
        img.save(final_path, "JPEG", quality=100)

398 

399 

def resize_image_from_path(image_path):
    """
    Resize the image stored at `image_path` and save it back to the same path.

    Resizing is delegated to the `resize_image` method attached to PIL images
    by this module, called with its default maximum width.

    :param image_path: path of the image to resize in place
    :return: None
    """
    # The context manager closes the opened source image. Rebinding `img` to
    # the resized copy, as the previous code did, left it open.
    with Image.open(image_path) as img:
        resized = img.resize_image()
        try:
            resized.save(image_path, quality=100)
        finally:
            # resize_image may return the source image itself; only close the
            # copy here, the `with` block takes care of the source.
            if resized is not img:
                resized.close()

407 

408 

def convert_image_for_web(image_path):
    """
    Prepare an image file for web display.

    - '.wmf' / '.emf' files are skipped without being opened.
    - '.tiff' / '.tif' files get a '.jpg' version written through the
      `convert_tiff_to_jpg` method attached to PIL images by this module.
    The extension check is case-insensitive.

    :param image_path: path of the image to process
    :return: None
    """
    extension = os.path.splitext(os.path.basename(image_path))[1].lower()

    # Decide on the skip before opening, so no file handle is left open.
    if extension in (".wmf", ".emf"):
        return

    with Image.open(image_path) as img:
        if extension in (".tiff", ".tif"):
            img.convert_tiff_to_jpg()

        # NOTE(review): as in the original code, the resized image is neither
        # saved nor returned - confirm whether it was meant to be written back.
        img.resize_image()

421 

422 

def create_innerlink_for_citation(html_text, biblio):
    """
    Turn the citations found in `html_text` into links to their references.

    For every entry of `biblio`, each occurrence of the citation key is
    replaced by a `tooltipPCJ` span holding a link to `#r<label>` and a hidden
    tooltip with the reference text. Two spelling variants of the key are also
    replaced: the key without commas, and that variant with "&" written "and".

    :param html_text: the HTML text in which citations are looked up
    :param biblio: dict mapping a citation key to a dict with the entries
                   "label" (square brackets are removed from it) and
                   "reference" (text shown in the tooltip)
    :return: the HTML text with citations replaced; returned untouched when
             `biblio` is empty
    """
    for key, value in biblio.items():
        # Done per entry, as in the original code: with an empty biblio the
        # text comes back untouched.
        html_text = html_text.replace("\n", " ")
        label = value["label"].replace("[", "").replace("]", "")
        highlight_id = f"'r{label}'"
        reference = value["reference"]
        tooltip_html = (
            '<span style="position: absolute; visibility: hidden" class="tooltip tooltiptexthidden">'
            + f"{reference}"
            + "</span>"
        )
        citation_link = f'<a id="{label}" href="#r{label}" onclick="highlightReference({highlight_id}, 3000, 500)" >{key}</a>'
        new_html = f'<span class="tooltipPCJ">{citation_link}{tooltip_html}</span>'

        # Distinct spellings only, in order. When the key has no comma or "&"
        # the variants equal the key; replacing them again would hit the key
        # inside the link just inserted and nest the markup.
        without_comma = key.replace(",", "")
        variants = dict.fromkeys((key, without_comma, without_comma.replace("&", "and")))
        for variant in variants:
            html_text = html_text.replace(f"{variant}", new_html)
    return html_text