Coverage for apps/ptf/utils.py: 50%

1import difflib

2import html

3import os

4import re

5import subprocess

6import unicodedata

8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES

9from bleach.css_sanitizer import CSSSanitizer

10from bleach.sanitizer import Cleaner

11from PIL import Image

12from PIL import ImageFile

14ImageFile.LOAD_TRUNCATED_IMAGES = True

16from django.conf import settings

17from django.core.exceptions import ImproperlyConfigured

18from django.core.mail import EmailMultiAlternatives

19from django.template import Template

20from django.template import TemplateSyntaxError

21from django.template import engines

22from django.template.loader import render_to_string

23from django.utils.html import strip_tags

24from django.utils.translation import gettext_lazy as _

26from ptf.site_register import SITE_REGISTER

# Matches markup tags (possibly spanning several lines) and HTML character
# references: named, decimal or hexadecimal, in any letter case.
_MARKUP_RE = re.compile(
    r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
    re.IGNORECASE | re.DOTALL,
)


def strip_markup(string):
    """
    Strip string from :
    - xml markup (mathml, html, etc..), including tags broken over several lines
    - html entities (&nbsp;, &#233;, &#xE9;, etc...)

    Only the tags and entities are removed; the text between tags is kept.
    """
    return _MARKUP_RE.sub("", string)

def highlight_diff(ours, theirs):
    """
    Rebuild `theirs` piece by piece from its diff against `ours`.

    Both strings are lower-cased and stripped of markup before being compared,
    so the result is made of slices of the normalised `theirs`.
    """
    normalised_ours = strip_markup(ours.lower())
    normalised_theirs = strip_markup(theirs.lower())
    matcher = difflib.SequenceMatcher(None, normalised_ours, normalised_theirs)

    # NOTE(review): every opcode kind currently yields the bare slice of `b`.
    # The per-tag branches are kept because "equal" and "replace" were
    # presumably meant to wrap their fragment differently - TODO confirm.
    fragments = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        piece = matcher.b[j1:j2]
        if tag == "equal":
            fragments.append(f"{piece}")
        elif tag == "replace":
            fragments.append(f"{piece}")
        else:
            fragments.append(piece)
    return "".join(fragments)

def volume_display():
    """
    Return the label used for a volume: the literal "Volume" when
    `settings.VOLUME_STRING` is truthy, the translated "Tome" otherwise.
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")

def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command, or only log it when execution is disabled.

    @param cmd: str which represents shell command
    @param force_execute: run the command even when
        `settings.MERSENNE_CREATE_FRONTPAGE` is falsy
    @return: the output of `subprocess.check_output` when the command is run;
        otherwise the tuple `(0, cmd)`, after the command has been appended to
        `cmd.log` inside `settings.LOG_DIR`
    """
    should_run = settings.MERSENNE_CREATE_FRONTPAGE or force_execute
    if should_run:
        return subprocess.check_output(cmd, shell=True)

    # Debug mode: record the command instead of running it.
    log_path = os.path.join(settings.LOG_DIR, "cmd.log")
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"cmd : {cmd}\n")
    return 0, cmd

def get_file_content_in_utf8(filename):
    """
    Read a whole text file, decoding it as UTF-8.

    :param filename: path of the file to read
    :return: the body of a utf-8 file
    """
    with open(filename, encoding="utf-8") as stream:
        return stream.read()

def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the `pdftotext` command line tool.

    Nothing is run and "" is returned when
    `settings.MERSENNE_CREATE_FRONTPAGE` is falsy. Otherwise the text is
    written to `fulltext.txt` inside `settings.MERSENNE_TMP_FOLDER` (created
    if needed) and read back as UTF-8.

    :param pdf_filename: path of the PDF to convert
    :return: the extracted text with every Unicode "C*" (control, format, ...)
        character removed; this includes newlines and tabs
    :raises RuntimeError: if the output file does not exist after the command
    """
    import shlex

    if not settings.MERSENNE_CREATE_FRONTPAGE:
        return ""

    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # execute_cmd passes the string to a shell: quote both paths so that
    # spaces or shell metacharacters in them cannot split or alter the command.
    cmd_str = (
        "pdftotext -raw -nopgbrk -enc UTF-8 "
        f"{shlex.quote(str(pdf_filename))} {shlex.quote(str(txt_filename))}"
    )
    execute_cmd(cmd_str)

    # Check if the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # strip control characters
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")

112

113

def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with `qpdf --linearize`.

    A failure of the command is tolerated as long as `to_path` exists
    afterwards; otherwise the error is re-raised unchanged.

    :param from_path: path of the PDF to read
    :param to_path: path of the linearized PDF to write
    :return: always False
    :raises Exception: whatever running qpdf raised, when `to_path` is missing
    """
    import shlex

    # The command goes through a shell: quote both paths so that spaces or
    # shell metacharacters in them cannot split or alter the command.
    cmd_str = f"qpdf --linearize {shlex.quote(str(from_path))} {shlex.quote(str(to_path))}"

    try:
        subprocess.check_output(cmd_str, shell=True)
    except Exception:
        if not os.path.isfile(to_path):
            raise

    return False

127

128

def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    - When neither `first_name` nor `last_name` holds anything but whitespace
      (or both are None/empty), `string_name` is returned unchanged.
    - When `settings.DISPLAY_FIRST_NAME_FIRST` is truthy the result is
      "first last"; `prefix` and `suffix` are not used in that case.
    - Otherwise the result is "[prefix ]last, first[ suffix]".
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    # Keep only the usable name parts; None and blank strings count as missing
    # so they can neither crash `.strip()` nor produce an empty display name.
    name_parts = [part for part in (last_name, first_name) if part and part.strip()]
    if not name_parts:
        return string_name

    if display_first_name_first:
        return " ".join(reversed(name_parts))

    display_name = f"{prefix} " if prefix else ""
    display_name += ", ".join(name_parts)
    if suffix:
        display_name += f" {suffix}"
    return display_name

141

142

def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitizes HTML input from the CKEditor.
    It uses bleach library (https://bleach.readthedocs.io/en/latest/index.html), an allowed-list-based sanitizer.
    JavaScript is removed by allowing only a subset of HTML tags and attributes.
    It does not make use of `lxml.html.Cleaner` because the documentation clearly says that this is not a secure
    approach.

    html: str
        The HTML string to sanitize.
    allow_img : bool
        Whether to preserve img related tags

    Returns the sanitized HTML. Tags outside the allowed list are stripped
    (`strip=True`) rather than escaped.
    """
    # NOTE(review): "sub" is allowed but "sup" is not - confirm this is intended.
    allowed_tags = [
        "a",
        "abbr",
        "acronym",
        "address",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "cite",
        "code",
        "dd",
        "del",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "hgroup",
        "hr",
        "i",
        "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p",
        "pre",
        "q",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "tr",
        "u",
        "ul",
        "var",
    ]

    # Tags that are only kept when images are explicitly allowed.
    image_allowed_tags = ["figcaption", "figure", "img", "picture", "source"]
    if allow_img:
        allowed_tags = allowed_tags + image_allowed_tags

    allowed_attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    # CSS properties accepted in `style` attributes on top of bleach's defaults.
    additional_css_properties = [
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    ]
    allowed_css_properties = ALLOWED_CSS_PROPERTIES | set(additional_css_properties)

    css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
    cleaner = Cleaner(
        tags=allowed_tags, attributes=allowed_attributes, css_sanitizer=css_sanitizer, strip=True
    )
    return cleaner.clean(html)

249

250

def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str | None = None,
    cc: list[str] | tuple[str, ...] = (),
    from_collection: str = "",
    reply_to: list[str] | tuple[str, ...] = (),
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.
    If not provided, the sender's e-mail address default to `settings.DEFAULT_FROM_EMAIL`
    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is None
                            or empty it will get the email from site_register.py (`email_from`).
        - reply_to          The list or tuple of the e-mail Reply-To addresses
    Raises:
        - ImproperlyConfigured  when `from_collection` is given, no `from_email` is given
                                and the collection has no `email_from` entry.
    """
    # Both None (the default) and "" mean "no sender given": look the sender up
    # from the collection in that case.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # Without a collection there is nothing to look up: keep the
            # sender unset instead of failing.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    # We additionally unescape HTML characters here to avoid leaving HTML
    # entities (e.g. `&gt;`) in the plain-text body.
    text_content = html.unescape(strip_tags(html_content))
    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc,
        headers={"Return-path": return_path},
        reply_to=reply_to,
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)

299

300

def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str = "",
    cc: list[str] | tuple[str, ...] = (),
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends it as an e-mail to the
    provided recipients and copy recipients.
    It sends the e-mail with both a text and a HTML alternative.
    If not provided, the sender's e-mail address default to `settings.DEFAULT_FROM_EMAIL`
    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - from_email        The sender's e-mail address
        - to                The list or tuple of the e-mail recipients
        - cc                The list or tuple of the e-mail CC
        - from_collection:  The collection to send the mail for. If from_email is None
                            or empty it will get the email from site_register.py (`email_from`).
    Raises:
        - ImproperlyConfigured  when `from_collection` is given, no `from_email` is given
                                and the collection has no `email_from` entry.
    """
    # Both "" (the default) and None mean "no sender given": look the sender up
    # from the collection before rendering anything.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # Without a collection there is nothing to look up: keep the
            # sender unset instead of failing.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    html_content = render_to_string(template, context_data)
    send_email(
        html_content, subject, to=to, from_email=from_email, cc=cc, from_collection=from_collection
    )

339

340

def template_from_string(template_string, using=None) -> Template:
    """
    Convert a string into a template object using a given template engine
    or using the default backends from `settings.TEMPLATES` if no engine was specified.

    The engines are tried in order and the first one that accepts the string
    wins. If none does, `TemplateSyntaxError` is raised with the last engine
    error attached as its cause.
    """
    # This function is based on django.template.loader.get_template,
    # but uses Engine.from_string instead of Engine.get_template.
    engine_list = engines.all() if using is None else [engines[using]]
    last_error = None
    for engine in engine_list:
        try:
            return engine.from_string(template_string)
        except TemplateSyntaxError as err:
            # Remember why this engine refused the string, then try the next one.
            last_error = err
    raise TemplateSyntaxError(template_string) from last_error

355

356

def resize_image(img, max_size=1600):
    """
    Take an image in argument and resize it to a {max_size} width with the same ratio.

    The image is returned untouched when it is not wider than `max_size`.
    Otherwise a resized copy is returned; its height is rounded down and is
    never smaller than 1 pixel.
    """
    if img.width <= max_size:
        return img

    # Multiply before dividing so the height is not skewed by a float ratio,
    # and clamp it: extremely wide images would otherwise get a height of 0.
    new_height = max(1, int(max_size * img.height // img.width))
    return img.resize((max_size, new_height))

367

368

def convert_tiff_to_jpg(img):
    """
    Save a '.jpg' copy of an opened image next to its source file.

    The target path is the image's `filename` with its extension replaced by
    '.jpg'. Images whose mode the JPEG writer cannot store (RGBA, palette,
    ...) are converted to RGB first. Nothing is returned.
    """
    source_dir, source_file = os.path.split(img.filename)
    jpg_path = os.path.join(source_dir, os.path.splitext(source_file)[0] + ".jpg")

    # Modes Pillow's JPEG writer accepts as-is; anything else must be converted.
    if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        img = img.convert("RGB")
    img.save(jpg_path, "JPEG", quality=100)

378

379

# Attach the two helpers to PIL's Image class so that they can be called as
# methods on any opened image: `img.resize_image()`, `img.convert_tiff_to_jpg()`.
setattr(Image.Image, "resize_image", resize_image)
setattr(Image.Image, "convert_tiff_to_jpg", convert_tiff_to_jpg)

382

383

def convert_tiff_to_jpg_from_path(image_path):
    """
    Take the path of a '.tiff' image and write a '.jpg' copy next to it.

    The copy has the same base name as `image_path` with a '.jpg' extension.
    """
    # `convert_tiff_to_jpg` already writes the '.jpg' file next to the source,
    # so the image is not saved a second time here. The context manager closes
    # the file even if the conversion fails.
    with Image.open(image_path) as img:
        img.convert_tiff_to_jpg()

398

399

def resize_image_from_path(image_path):
    """
    Resize the image stored at `image_path` with `resize_image` and save the
    result back to the same path with `quality=100`.

    The file is re-saved even when the image did not need resizing.
    """
    # `resize_image` may return a new image object; the context manager makes
    # sure the originally opened file is closed in every case.
    with Image.open(image_path) as img:
        resized = img.resize_image()
        resized.save(image_path, quality=100)

407

408

def convert_image_for_web(image_path):
    """
    Prepare the image at `image_path` for the web.

    - '.wmf' / '.emf' files are skipped without being opened.
    - '.tiff' / '.tif' files get a '.jpg' copy written next to them.
    The extension is compared case-insensitively.
    """
    extension = os.path.splitext(image_path)[1].lower()
    if extension in (".wmf", ".emf"):
        return

    with Image.open(image_path) as img:
        if extension in (".tiff", ".tif"):
            img.convert_tiff_to_jpg()

        # NOTE(review): the resized image is not saved anywhere, so this step
        # has no lasting effect - confirm whether it should be written back.
        resized = img.resize_image()
        if resized is not img:
            resized.close()

421

422

def create_innerlink_for_citation(html_text, biblio):
    """
    Replace each citation found in `html_text` with a link to its reference.

    `biblio` maps the citation text (the key) to a dict that provides at least
    a "label" and a "reference" entry. For every entry, occurrences of the key
    are replaced by an anchor pointing to `#r<label>`, followed by the
    reference text. Returns the transformed text.
    """
    for key, value in biblio.items():
        # Newlines are flattened on each pass; an empty `biblio` therefore
        # leaves `html_text` completely untouched.
        html_text = html_text.replace("\n", " ")
        # The label is used without its square brackets in ids and anchors.
        label = value["label"].replace("[", "").replace("]", "")
        highlight_id = f"'r{label}'"
        reference = value["reference"]
        # NOTE(review): the literals around the reference are empty, so this is
        # just the reference text - presumably wrapper markup is missing here;
        # confirm against the tooltip markup expected by the templates.
        tooltip_html = (
            ''
            + f"{reference}"
            + ""
        )
        citation_link = f'<a id="{label}" href="#r{label}" onclick="highlightReference({highlight_id}, 3000, 500)" >{key}</a>'
        new_html = f'{citation_link}{tooltip_html}'
        # The key is searched in three spellings: as is, without commas, then
        # with "&" written as "and".
        # NOTE(review): `new_html` itself contains `key`, so when a later
        # spelling is identical to an earlier one (no comma / no "&" in the
        # key) the text inside the link just inserted matches again - confirm
        # this cannot produce nested links.
        html_text = html_text.replace(f"{key}", new_html)
        key = key.replace(",", "")
        html_text = html_text.replace(f"{key}", new_html)
        key = key.replace("&", "and")
        html_text = html_text.replace(f"{key}", new_html)
    return html_text