Coverage for apps/ptf/utils.py: 74%

120 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-02-28 09:09 +0000

1import difflib 

2import html 

3import os 

4import re 

5import subprocess 

6import unicodedata 

7 

8from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES 

9from bleach.css_sanitizer import CSSSanitizer 

10from bleach.sanitizer import Cleaner 

11 

12from django.conf import settings 

13from django.core.exceptions import ImproperlyConfigured 

14from django.core.mail import EmailMultiAlternatives 

15from django.template import Template 

16from django.template import TemplateSyntaxError 

17from django.template import engines 

18from django.template.loader import render_to_string 

19from django.utils.html import strip_tags 

20from django.utils.translation import gettext_lazy as _ 

21 

22from ptf.site_register import SITE_REGISTER 

23 

24 

# Matches any XML/HTML tag, plus named (&nbsp;), decimal (&#160;) and
# hexadecimal (&#xA0;) character references.
# IGNORECASE is required because many named entities contain upper-case
# letters (&Eacute;, &Agrave;, ...) and hexadecimal references may be written
# with upper-case digits or an upper-case "X".
_MARKUP_RE = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", re.IGNORECASE)


def strip_markup(string):
    """
    Remove markup from a string.

    The following are deleted (replaced by the empty string):
    - XML-like tags (MathML, HTML, etc.): anything between "<" and the
      next ">" on the same line
    - character references terminated by ";" (&nbsp;, &Eacute;, &#160;, &#xA0;)

    :param string: the text to clean
    :return: the text without tags and character references
    """
    return _MARKUP_RE.sub("", string)

33 

34 

def highlight_diff(ours, theirs):
    """
    Build an HTML rendering of `theirs` that highlights how it differs from `ours`.

    Both strings are lower-cased and stripped of markup before being compared
    with difflib.SequenceMatcher. The returned string is made of the pieces of
    the (cleaned) `theirs` text:
    - pieces equal to `ours` are wrapped in a "bg-success" span
    - pieces replacing text of `ours` are wrapped in a "bg-dark" span
    - every other piece (insertions; deletions contribute nothing) is left bare
    """
    cleaned_ours = strip_markup(ours.lower())
    cleaned_theirs = strip_markup(theirs.lower())
    matcher = difflib.SequenceMatcher(None, cleaned_ours, cleaned_theirs)

    pieces = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        chunk = matcher.b[j1:j2]
        if tag == "equal":
            chunk = f"<span class='bg-success'>{chunk}</span>"
        elif tag == "replace":
            chunk = f"<span class='bg-dark'>{chunk}</span>"
        pieces.append(chunk)

    return "".join(pieces)

49 

50 

def volume_display():
    """
    Return the label used for a volume.

    "Volume" when settings.VOLUME_STRING is truthy, otherwise the
    (lazily translated) string "Tome".
    """
    return "Volume" if settings.VOLUME_STRING else _("Tome")

56 

57 

def execute_cmd(cmd, force_execute=False):
    """
    Execute a shell command, or only log it when execution is disabled.

    @param cmd: str which represents the shell command
    @param force_execute: run the command even if
        settings.MERSENNE_CREATE_FRONTPAGE is falsy
    @return: the raw output of the command when it is executed; otherwise the
        tuple (0, cmd), after the command has been appended to
        <settings.LOG_DIR>/cmd.log
    """
    if not (settings.MERSENNE_CREATE_FRONTPAGE or force_execute):
        # Debug mode: do not run anything, just keep a trace of the command.
        log_path = os.path.join(settings.LOG_DIR, "cmd.log")
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"cmd : {cmd}\n")
        return 0, cmd

    return subprocess.check_output(cmd, shell=True)

75 

76 

def get_file_content_in_utf8(filename):
    """
    Read a whole text file, decoding it as UTF-8.

    :param filename: path of the file to read
    :return: the full content of the file as a str
    """
    with open(filename, encoding="utf-8") as handle:
        return handle.read()

86 

87 

def pdf_to_text(pdf_filename):
    """
    Extract the full text of a PDF with the `pdftotext` command line tool.

    The text is written to "fulltext.txt" inside settings.MERSENNE_TMP_FOLDER
    (created if needed), read back as UTF-8, and stripped of every character
    whose Unicode category starts with "C" (control characters, which
    includes line breaks).

    :param pdf_filename: path of the PDF file to convert
    :return: the extracted text
    :raises subprocess.CalledProcessError: if pdftotext exits with an error
    :raises OSError: if the pdftotext executable cannot be started
    :raises RuntimeError: if pdftotext did not create the output file
    """
    os.makedirs(settings.MERSENNE_TMP_FOLDER, exist_ok=True)
    txt_filename = os.path.join(settings.MERSENNE_TMP_FOLDER, "fulltext.txt")

    # The command is passed as an argument list and run without a shell, so
    # paths containing spaces or shell metacharacters are handed to pdftotext
    # verbatim instead of being split or interpreted.
    cmd = ["pdftotext", "-raw", "-nopgbrk", "-enc", "UTF-8", pdf_filename, txt_filename]
    subprocess.check_output(cmd)

    # Check if the output file has been created
    if not os.path.isfile(txt_filename):
        raise RuntimeError("The PDF file was not converted by pdftotext")

    body = get_file_content_in_utf8(txt_filename)
    # strip control characters
    return "".join(ch for ch in body if unicodedata.category(ch)[0] != "C")

109 

110 

def linearize_pdf(from_path, to_path):
    """
    Linearize a PDF with the `qpdf` command line tool.

    :param from_path: path of the PDF to linearize
    :param to_path: path of the linearized PDF to create
    :return: always False
    :raises Exception: the error raised while running qpdf, but only when
        `to_path` does not exist afterwards
    """
    # The command is passed as an argument list and run without a shell, so
    # paths containing spaces or shell metacharacters reach qpdf verbatim.
    cmd = ["qpdf", "--linearize", from_path, to_path]

    try:
        subprocess.check_output(cmd)
    except Exception:
        # Best effort: a failure is ignored as long as the output file exists.
        # NOTE(review): presumably because qpdf can exit with a non-zero
        # status (e.g. warnings) while still writing its output - confirm.
        if not os.path.isfile(to_path):
            raise

    return False

124 

125 

def get_display_name(prefix, first_name, last_name, suffix, string_name):
    """
    Build the name to display for a person.

    - If neither `first_name` nor `last_name` holds any non-blank text
      (None, empty or whitespace only), `string_name` is returned unchanged.
    - If settings.DISPLAY_FIRST_NAME_FIRST is truthy, the result is
      "<first_name> <last_name>" (prefix and suffix are not used).
    - Otherwise the result is "<prefix> <last_name>, <first_name> <suffix>",
      where prefix and suffix are only added when truthy.

    :param prefix: optional text placed before the name
    :param first_name: first name, may be None or blank
    :param last_name: last name, may be None or blank
    :param suffix: optional text placed after the name
    :param string_name: fallback used when no first/last name is available
    :return: the display name
    """
    display_first_name_first = getattr(settings, "DISPLAY_FIRST_NAME_FIRST", False)

    # Keep only the name parts that contain something else than whitespace.
    # `x and ...` guards against None, on which .strip() would fail.
    name_parts = [x for x in (last_name, first_name) if x and x.strip()]
    if not name_parts:
        # Nothing usable: do not overwrite the fallback with an empty name.
        return string_name

    if display_first_name_first:
        return " ".join(reversed(name_parts))

    display_name = f"{prefix} " if prefix else ""
    display_name += ", ".join(name_parts)
    display_name += f" {suffix}" if suffix else ""
    return display_name

138 

139 

def ckeditor_input_sanitizer(html: str, allow_img: bool = False) -> str:
    """
    Sanitize HTML coming from the CKEditor.

    The cleaning is delegated to the bleach library
    (https://bleach.readthedocs.io/en/latest/index.html), which works from
    allow-lists: only the tags, attributes and CSS properties listed below are
    kept, which removes JavaScript. Tags that are not allowed are stripped
    (not escaped).
    `lxml.html.Cleaner` is deliberately not used: its documentation clearly
    says that it is not a secure approach.

    html: str
        The HTML string to sanitize.
    allow_img : bool
        Whether to preserve img related tags
        (figcaption, figure, img, picture, source).
    """
    base_tags = (
        "a", "abbr", "acronym", "address", "aside",
        "b", "bdi", "bdo", "blockquote", "br",
        "caption", "cite", "code",
        "dd", "del", "dfn", "div", "dl", "dt",
        "em",
        "h1", "h2", "h3", "h4", "h5", "h6", "hgroup", "hr",
        "i", "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p", "pre",
        "q",
        "s", "samp", "small", "span", "strike", "strong", "sub",
        "table", "tbody", "td", "th", "thead", "tr",
        "u", "ul",
        "var",
    )
    image_tags = ("figcaption", "figure", "img", "picture", "source")

    tags = list(base_tags)
    if allow_img:
        tags.extend(image_tags)

    attributes = {
        "*": ["class", "dir", "style", "id", "name"],
        "a": ["href", "target"],
        "img": ["alt", "height", "src", "width"],
        "source": ["type", "src"],
        # Those table attributes are deprecated but they are still used by CKEditor 4
        # We might consider upgrading/migrating to CKEditor 5 at some point
        "table": ["align", "border", "cellspacing", "cellpadding"],
        "th": ["scope"],
    }

    # CSS properties accepted in addition to bleach's default allow-list.
    extra_css_properties = (
        "border",
        "margin",
        "margin-left",
        "margin-right",
        "margin-top",
        "margin-bottom",
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    )
    css_properties = ALLOWED_CSS_PROPERTIES.union(extra_css_properties)

    return Cleaner(
        tags=tags,
        attributes=attributes,
        css_sanitizer=CSSSanitizer(allowed_css_properties=css_properties),
        strip=True,
    ).clean(html)

246 

247 

def send_email(
    html_content: str,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str | None = None,
    cc: list[str] | tuple[str, ...] = (),
    from_collection: str = "",
    reply_to: list[str] | tuple[str, ...] = (),
) -> None:
    """
    Sends an e-mail to the provided recipients and copy recipients with the provided html content.
    It sends the e-mail with both a text and a HTML alternative.

    When `from_email` is not provided (None or empty string), the sender is
    looked up in site_register.py (`email_from` of `from_collection`). If no
    collection is given either, `from_email` is passed on unchanged.
    NOTE(review): Django is then expected to fall back to
    `settings.DEFAULT_FROM_EMAIL` - confirm.

    Params:
        - html_content      The HTML content of the e-mail
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection   The collection to send the mail for. Only used when
                            from_email is not provided.
        - reply_to          The list or tuple of the Reply-To addresses

    Raises:
        ImproperlyConfigured if `from_collection` is given but has no
        `email_from` entry in site_register.py.
    """
    # `not from_email` covers both the default None and the empty string, so
    # the documented lookup also happens when from_email is simply omitted.
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # No collection requested: keep from_email as it is.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    # We additionally unescape HTML characters here to avoid having stuff like
    # &nbsp; &gt; etc. in the output text.
    text_content = html.unescape(strip_tags(html_content))
    # Create the email, and attach the HTML version as well.
    return_path = getattr(settings, "RETURN_PATH", "no-reply@listes.mathdoc.fr")
    msg = EmailMultiAlternatives(
        subject=subject,
        body=text_content,
        from_email=from_email,
        to=to,
        cc=cc,
        headers={"Return-path": return_path},
        reply_to=reply_to,
    )
    msg.attach_alternative(html_content, "text/html")
    msg.send(fail_silently=False)

296 

297 

def send_email_from_template(
    template: str,
    context_data: dict,
    subject: str,
    to: list[str] | tuple[str, ...],
    from_email: str = "",
    cc: list[str] | tuple[str, ...] = (),
    from_collection: str = "",
) -> None:
    """
    Renders the provided template and sends it as an e-mail to the
    provided recipients and copy recipients.
    It sends the e-mail with both a text and a HTML alternative.

    When `from_email` is not provided (empty string or None), the sender is
    looked up in site_register.py (`email_from` of `from_collection`). The
    lookup happens before the template is rendered, so a misconfigured
    collection fails early.

    Params:
        - template          The HTML template of the e-mail
        - context_data      The context data used to render the template
        - subject           The e-mail's subject
        - to                The list or tuple of the e-mail recipients
        - from_email        The sender's e-mail address
        - cc                The list or tuple of the e-mail CC
        - from_collection   The collection to send the mail for. Only used when
                            from_email is not provided.

    Raises:
        ImproperlyConfigured if `from_collection` is given but has no
        `email_from` entry in site_register.py.
    """
    if not from_email:
        try:
            from_email = SITE_REGISTER[from_collection.lower()]["email_from"]
        except (KeyError, ValueError) as err:
            # No collection requested: leave the choice of sender to send_email.
            if from_collection:
                raise ImproperlyConfigured(
                    f"The collection {from_collection.lower()} is missing the "
                    "email_from property in site_register.py"
                ) from err

    html_content = render_to_string(template, context_data)
    send_email(
        html_content, subject, to=to, from_email=from_email, cc=cc, from_collection=from_collection
    )

336 

337 

def template_from_string(template_string, using=None) -> Template:
    """
    Convert a string into a template object using a given template engine
    or using the default backends from `settings.TEMPLATES` if no engine was specified.

    Each candidate engine is tried in order; the first one that parses the
    string wins.

    :param template_string: the template source
    :param using: optional name of the template engine to use
    :return: the template built by the first engine accepting the string
    :raises TemplateSyntaxError: if no engine could parse the string. The
        message is the template source; the syntax error reported by the last
        engine tried is attached as the cause (`__cause__`).
    """
    # This function is based on django.template.loader.get_template,
    # but uses Engine.from_string instead of Engine.get_template.
    engine_list = engines.all() if using is None else [engines[using]]
    last_error = None
    for engine in engine_list:
        try:
            return engine.from_string(template_string)
        except TemplateSyntaxError as err:
            # Remember why this engine refused the string, then try the next one.
            last_error = err
    raise TemplateSyntaxError(template_string) from last_error