Coverage for apps/ptf/model_data_converter.py: 60%

362 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-07-18 09:02 +0000

1################################################################################################## 

2# 

3# README 

4# 

5# Operations on the xml data objects 

6# Django DB -> Data objects 

7# 

8################################################################################################## 

9 

10import types 

11 

12from django.db.models import Q 

13 

14from ptf.cmds.xml.citation_html import get_citation_html 

15from ptf.cmds.xml.jats import jats_parser 

16from ptf.cmds.xml.xml_base import RefBase 

17from ptf.cmds.xml.xml_utils import escape 

18from ptf.cmds.xml.xml_utils import get_contrib_xml 

19from ptf.model_data import ArticleData 

20from ptf.model_data import BookData 

21from ptf.model_data import BookPartData 

22from ptf.model_data import Foo 

23from ptf.model_data import IssueData 

24from ptf.model_data import JournalData 

25from ptf.model_data import MathdocPublicationData 

26from ptf.model_data import PublisherData 

27from ptf.model_data import RefData 

28from ptf.model_data import create_contributor 

29 

30 

def db_append_obj_with_location_to_list(resource_qs, data_list):
    """
    Append one dict per object of resource_qs to data_list (modified in place).

    Each dict holds the object's "rel", "mimetype" and "location", plus "base"
    (obj.base.base, or "" when the object has no base). The optional
    "metadata", "text" and "caption" entries are copied only when the object
    has an attribute of that name.
    """
    optional_attrs = ("metadata", "text", "caption")

    for item in resource_qs:
        entry = {
            "rel": item.rel,
            "mimetype": item.mimetype,
            "location": item.location,
            "base": item.base.base if item.base else "",
        }
        # 'seq' is deliberately not exported.

        entry.update(
            {name: getattr(item, name) for name in optional_attrs if hasattr(item, name)}
        )

        data_list.append(entry)

46 

47 

def db_to_contributors(qs):
    """
    Convert the contributions of qs (anything exposing .all()) into a list of contributors.

    Every contributor is created with create_contributor() and filled from the
    matching contribution. "orcid" and "mid" fall back to "" when empty, and
    "addresses" is the list of addresses of the contribution's contribaddress_set.
    """
    result = []

    for contribution in qs.all():
        entry = create_contributor()

        for field in ("first_name", "last_name", "prefix", "suffix"):
            entry[field] = getattr(contribution, field)
        entry["orcid"] = contribution.orcid or ""
        entry["email"] = contribution.email
        entry["string_name"] = contribution.string_name
        entry["mid"] = contribution.mid or ""
        entry["addresses"] = [item.address for item in contribution.contribaddress_set.all()]
        for field in (
            "role",
            "deceased_before_publication",
            "equal_contrib",
            "corresponding",
            "contrib_xml",
        ):
            entry[field] = getattr(contribution, field)

        result.append(entry)

    return result

73 

74 

def db_to_resource_data_common(resource, data_resource):
    """
    Copy the fields shared by all resources from a DB object to a data object.

    data_resource is modified in place: scalar fields are copied, the list
    attributes (ids, extids, counts, contributors, kwds, subjs, abstracts,
    awards) are rebuilt, and ext_links, streams, related_objects,
    supplementary_materials and relations are appended to.
    """
    scalar_fields = (
        "pid",
        "doi",
        "lang",
        "title_xml",
        "title_tex",
        "title_html",
        "abbrev",
        "trans_lang",
        "trans_title_tex",
        "trans_title_html",
        "funding_statement_xml",
        "funding_statement_html",
        "footnotes_xml",
        "footnotes_html",
    )
    for field in scalar_fields:
        setattr(data_resource, field, getattr(resource, field))

    data_resource.ids = [
        (resource_id.id_type, resource_id.id_value)
        for resource_id in resource.resourceid_set.all()
    ]
    data_resource.extids = [
        (ext_id.id_type, ext_id.id_value) for ext_id in resource.extid_set.all()
    ]

    db_append_obj_with_location_to_list(resource.extlink_set.all(), data_resource.ext_links)
    db_append_obj_with_location_to_list(resource.datastream_set.all(), data_resource.streams)
    db_append_obj_with_location_to_list(
        resource.relatedobject_set.all(), data_resource.related_objects
    )

    # Figures (rel='html-image') are not exported here: per the original note,
    # they are updated by the FullText import after the Cedrics import.
    supplementary_qs = resource.relatedobject_set.filter(
        Q(rel="supplementary-material") | Q(rel="review")
    )
    db_append_obj_with_location_to_list(supplementary_qs, data_resource.supplementary_materials)

    data_resource.counts = [(item.name, item.value) for item in resource.resourcecount_set.all()]

    data_resource.contributors = db_to_contributors(resource.contributions)

    data_resource.kwds = [
        {"type": item.type, "lang": item.lang, "value": item.value}
        for item in resource.kwd_set.all()
    ]
    data_resource.subjs = [
        {"type": item.type, "lang": item.lang, "value": item.value}
        for item in resource.subj_set.all()
    ]

    data_resource.abstracts = [
        {
            "tag": item.tag,
            "lang": item.lang,
            "value_xml": item.value_xml,
            "value_tex": item.value_tex,
            "value_html": item.value_html,
        }
        for item in resource.abstract_set.all()
    ]

    data_resource.awards = [
        {"abbrev": item.abbrev, "award_id": item.award_id} for item in resource.award_set.all()
    ]

    # Relations where the resource is the subject: left label, pid of the object.
    for link in resource.subject_of.all():
        relation_data = Foo()
        relation_data.rel_type = link.rel_info.left
        relation_data.id_value = link.object_pid
        data_resource.relations.append(relation_data)

    # Relations where the resource is the object: right label, pid of the subject.
    for link in resource.object_of.all():
        relation_data = Foo()
        relation_data.rel_type = link.rel_info.right
        relation_data.id_value = link.subject_pid
        data_resource.relations.append(relation_data)

    # issn / e_issn exist only on some resources
    for field in ("issn", "e_issn"):
        if hasattr(resource, field):
            setattr(data_resource, field, getattr(resource, field))

157 

158 

def db_to_publisher_data(publisher):
    """
    Build a PublisherData from a DB publisher.

    Only the name (pub_name) and location (pub_loc) are copied; ext_links is
    always an empty list.
    """
    result = PublisherData()

    result.name, result.loc = publisher.pub_name, publisher.pub_loc

    # TODO: ext_links ?
    result.ext_links = []

    return result

169 

170 

def db_to_publication_data(collection):
    """
    Build a MathdocPublicationData from a DB collection.

    The common resource fields are copied first, then coltype, wall, issn and
    e_issn.
    """
    publication = MathdocPublicationData()

    db_to_resource_data_common(collection, publication)

    for field in ("coltype", "wall", "issn", "e_issn"):
        setattr(publication, field, getattr(collection, field))

    return publication

182 

183 

def db_to_journal_data(collection):
    """
    Build a JournalData from a DB collection; only the common resource fields are copied.
    """
    # Open questions kept from the original code:
    #  - A JournalData has no coltype ?
    #  - A JournalData has a publisher but it does not seem to be used anywhere ?
    #    The publisher seems to belong to the issue/article and not to the Journal.
    journal = JournalData()
    db_to_resource_data_common(collection, journal)
    return journal

194 

195 

def db_to_collection_data(collection):
    """
    Build a MathdocPublicationData from a DB collection.

    Besides the common resource fields, coltype, issn and e_issn are copied.
    vseries, volume and seq are copied only when the collection has them.
    """
    result = MathdocPublicationData()

    db_to_resource_data_common(collection, result)

    for field in ("coltype", "issn", "e_issn"):
        setattr(result, field, getattr(collection, field))

    # attributes used for CollectionMembership
    for field in ("vseries", "volume", "seq"):
        if hasattr(collection, field):
            setattr(result, field, getattr(collection, field))

    return result

214 

215 

def db_to_issue_data(container, articles=None):
    """
    Build an IssueData from a DB container (issue).

    :param container: the DB container to convert
    :param articles: optional iterable of DB articles to export (for example a
        prefetched or filtered queryset). When None, all the articles of the
        container (container.article_set.all()) are exported. An empty
        iterable is respected and gives an issue without articles.
    :return: the IssueData, with its journal, publisher and articles
    """
    data_issue = IssueData()

    db_to_resource_data_common(container, data_issue)

    data_issue.ctype = container.ctype

    data_issue.year = container.year
    data_issue.vseries = container.vseries
    data_issue.volume = container.volume
    data_issue.number = container.number

    last_modified = container.last_modified
    data_issue.last_modified_iso_8601_date_str = (
        last_modified.isoformat() if last_modified else ""
    )
    # Evaluate deployed_date() only once
    deployed_date = container.deployed_date()
    data_issue.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    data_issue.journal = db_to_journal_data(container.my_collection)
    data_issue.publisher = db_to_publisher_data(container.my_publisher)
    data_issue.provider = container.provider.name

    # a Container has a seq, but it is used only for the books collections

    # articles may have been prefetched / filtered before.
    # Test against None: a filtered but empty selection must not fall back to all the articles.
    if articles is None:
        articles = container.article_set.all()

    for article in articles:
        data_issue.articles.append(db_to_article_data(article))

    return data_issue

250 

251 

def db_to_book_data(container):
    """
    Build a BookData from a DB container (book).

    The result holds the common resource fields, the publisher, the provider,
    the collections of the book (main collection first, then every collection
    of my_other_collections), the frontmatter when there is one, the body, the
    book parts and the bibliography.
    """
    data_book = BookData()

    db_to_resource_data_common(container, data_book)

    data_book.ctype = container.ctype
    data_book.year = container.year

    data_book.publisher = db_to_publisher_data(container.my_publisher)
    data_book.provider = container.provider

    data_col = db_to_collection_data(container.my_collection)
    # These attributes are required when adding a container to solr
    if not hasattr(data_col, "vseries"):
        data_col.vseries = 0
    if not hasattr(data_col, "volume"):
        data_col.volume = 0
    data_book.incollection.append(data_col)
    # Convert each other collection itself (and not the main collection again)
    for collection in container.my_other_collections.all():
        data_book.incollection.append(db_to_collection_data(collection))

    if hasattr(container, "frontmatter") and container.frontmatter is not None:
        frontmatter = container.frontmatter
        data_book.frontmatter_xml = frontmatter.value_xml
        data_book.frontmatter_toc_html = frontmatter.value_html
        data_book.frontmatter_foreword_html = frontmatter.foreword_html
    # NOTE(review): body is set unconditionally, as in db_to_article_data - confirm
    data_book.body = container.get_body()

    last_modified = container.last_modified
    data_book.last_modified_iso_8601_date_str = (
        last_modified.isoformat() if last_modified else ""
    )
    # Evaluate deployed_date() only once
    deployed_date = container.deployed_date()
    data_book.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    for bookpart in container.article_set.all():
        data_book.parts.append(db_to_bookpart_data(bookpart))

    for bibitem in container.bibitem_set.all():
        data_ref = db_to_ref_data(bibitem, data_book.lang)
        data_book.bibitems.append(data_ref)
        data_book.bibitem.append(data_ref.citation_html)

    return data_book

297 

298 

def db_to_article_data(article):
    """
    Build an ArticleData from a DB article.

    The result holds the common resource fields, the page/numbering fields,
    the dates as ISO 8601 strings ("" when a date is missing), the body in its
    different formats, the bibliography and, recursively, the translations of
    the article.
    """
    data_article = ArticleData()

    db_to_resource_data_common(article, data_article)

    data_article.atype = article.atype
    data_article.seq = str(article.seq)

    data_article.fpage = article.fpage
    data_article.lpage = article.lpage
    data_article.page_range = article.page_range
    data_article.page_type = article.page_type

    data_article.article_number = article.article_number
    data_article.talk_number = article.talk_number
    data_article.elocation = article.elocation
    data_article.coi_statement = article.coi_statement if article.coi_statement else ""

    date_published = article.date_published
    data_article.date_published_iso_8601_date_str = (
        date_published.isoformat() if date_published else ""
    )
    # deployed_date() is only asked for articles that belong to a container,
    # and is evaluated only once
    deployed_date = article.deployed_date() if article.my_container else None
    data_article.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    # Only the dates that are set are exported
    history = (
        ("received", article.date_received),
        ("revised", article.date_revised),
        ("accepted", article.date_accepted),
        ("online", article.date_online_first),
    )
    data_article.history_dates = [
        {"type": date_type, "date": date_value.isoformat()}
        for date_type, date_value in history
        if date_value
    ]

    data_article.body = article.get_body()
    data_article.body_html = article.body_html
    data_article.body_tex = article.body_tex
    data_article.body_xml = article.body_xml

    for bibitem in article.bibitem_set.all():
        data_ref = db_to_ref_data(bibitem, "und")
        data_article.bibitems.append(data_ref)
        data_article.bibitem.append(data_ref.citation_html)

    for trans_article in article.translations.all():
        data_article.translations.append(db_to_article_data(trans_article))

    return data_article

352 

353 

def db_to_bookpart_data(article):
    """
    Build a BookPartData from a DB article (a part of a book).

    The result holds the common resource fields, the type and page fields, the
    frontmatter when there is one, the body and the bibliography.
    """
    part = BookPartData()

    db_to_resource_data_common(article, part)

    for field in ("atype", "fpage", "lpage", "page_range", "page_type"):
        setattr(part, field, getattr(article, field))

    if hasattr(article, "frontmatter") and article.frontmatter is not None:
        part.frontmatter_xml = article.frontmatter.value_xml
        part.frontmatter_toc_html = article.frontmatter.value_html
        part.frontmatter_foreword_html = article.frontmatter.foreword_html
    # NOTE(review): body is set unconditionally, as in db_to_article_data - confirm
    part.body = article.get_body()

    # The references use the language of the book part
    for bibitem in article.bibitem_set.all():
        ref_data = db_to_ref_data(bibitem, part.lang)
        part.bibitems.append(ref_data)
        part.bibitem.append(ref_data.citation_html)

    return part

378 

379 

def db_to_ref_data(bibitem, lang):
    """
    Build a RefData (created with the given lang) from a DB bibitem.

    The scalar fields are copied as is, extids is the list of
    (id_type, id_value) of the bibitem ids, and contributors is built with
    db_to_contributors.
    """
    copied_fields = (
        "type",
        "user_id",
        "label",
        "citation_xml",
        "citation_tex",
        "citation_html",
        "publisher_name",
        "publisher_loc",
        "article_title_tex",
        "chapter_title_tex",
        "institution",
        "series",
        "volume",
        "issue",
        "month",
        "year",
        "comment",
        "annotation",
        "fpage",
        "lpage",
        "page_range",
        "size",
        "source_tex",
    )

    result = RefData(lang=lang)

    for field in copied_fields:
        setattr(result, field, getattr(bibitem, field))

    result.extids = [(item.id_type, item.id_value) for item in bibitem.bibitemid_set.all()]

    result.contributors = db_to_contributors(bibitem.contributions)

    return result

417 

418 

def jats_from_ref_comment(ref):
    """
    Return the <comment> element of a reference, or "" when ref.comment is None.

    The first "http://" URL of the comment (or the first "https://" one when
    there is no "http://") is wrapped in an <ext-link>. The URL ends at the
    next space; that space is not copied to the output. The text around the
    URL is escaped.
    """
    comment = ref.comment
    if comment is None:
        return ""

    url_start = comment.find("http://")
    if url_start == -1:
        url_start = comment.find("https://")

    if url_start == -1:
        inner = escape(comment)
    else:
        url_end = comment.find(" ", url_start)
        has_tail = url_end != -1

        url = escape(comment[url_start:url_end] if has_tail else comment[url_start:])

        inner = escape(comment[0:url_start])
        inner += f'<ext-link xlink:href="{url}">{url}</ext-link>'

        if has_tail:
            inner += escape(comment[url_end + 1 :])

    return f'<comment xml:space="preserve">{inner}</comment>'

447 

448 

def jats_from_ref_attr(
    ref,
    attr_name,
    jats_tag="",
    preserve=False,
    attr_type=None,
    attr_type_value="",
    convert_html_tag=False,
):
    """
    Return the JATS element of one attribute of a reference.

    :param ref: the reference
    :param attr_name: name of the attribute of ref to export
    :param jats_tag: name of the XML element; attr_name is used when empty
    :param preserve: add xml:space="preserve" to the element
    :param attr_type: name of an XML attribute to add to the element (optional)
    :param attr_type_value: value of that XML attribute
    :param convert_html_tag: build the content with
        jats_parser.get_single_title_xml instead of escape
    :return: the XML string, or "" when ref has no such attribute or when its
        value is empty
    """
    attr = getattr(ref, attr_name, None)
    if not attr:
        return ""

    tag = jats_tag or attr_name

    # The same content is used whatever the XML attributes of the element are,
    # so that convert_html_tag is honoured in every case.
    value = jats_parser.get_single_title_xml(attr) if convert_html_tag else escape(attr)

    xml_attributes = ""
    if attr_type is not None:
        xml_attributes += f' {attr_type}="{attr_type_value}"'
    if preserve:
        xml_attributes += ' xml:space="preserve"'

    return f"<{tag}{xml_attributes}>{value}</{tag}>"

479 

480 

def jats_from_ref(ref):
    """
    Return the content of the citation element of a reference, as an XML string.

    The pieces are concatenated in this order: authors, article title, chapter
    title, source, editors, series, volume, publisher name and location,
    institution, year, issue, doi, first and last pages, size, comment.
    """
    pieces = []

    authors = ref.get_authors()
    if authors is not None:
        pieces.append("".join([author["contrib_xml"] for author in authors]))

    pieces.append(
        jats_from_ref_attr(
            ref, "article_title_tex", "article-title", preserve=True, convert_html_tag=True
        )
    )
    pieces.append(
        jats_from_ref_attr(ref, "chapter_title_tex", "chapter-title", convert_html_tag=True)
    )
    pieces.append(
        jats_from_ref_attr(ref, "source_tex", "source", preserve=True, convert_html_tag=True)
    )

    editors = ref.get_editors()
    if editors is not None:
        pieces.append("".join([editor["contrib_xml"] for editor in editors]))

    pieces.append(jats_from_ref_attr(ref, "series", preserve=True))
    pieces.append(jats_from_ref_attr(ref, "volume"))
    pieces.append(jats_from_ref_attr(ref, "publisher_name", "publisher-name"))
    pieces.append(jats_from_ref_attr(ref, "publisher_loc", "publisher-loc"))
    for attr_name in ("institution", "year", "issue"):
        pieces.append(jats_from_ref_attr(ref, attr_name))
    pieces.append(
        jats_from_ref_attr(ref, "doi", "pub-id", attr_type="pub-id-type", attr_type_value="doi")
    )
    pieces.append(jats_from_ref_attr(ref, "fpage"))
    pieces.append(jats_from_ref_attr(ref, "lpage"))
    pieces.append(jats_from_ref_attr(ref, "size", "size"))
    pieces.append(jats_from_ref_comment(ref))

    return "".join(pieces)

513 

514 

def update_ref_data_for_jats(ref, i, with_label=True):
    """
    Update a reference in place (label, citation_html, citation_xml, contrib_xml) for JATS.

    Set with_label=False if you do not want a label in the citation_html (for example in the citedby)

    :param ref: the reference (a Munch dictionary or a RefData object)
    :param i: position of the reference, used to build the "[i]" label when
        the reference has no label and with_label is True
    :param with_label: see above
    """

    eid = getattr(ref, "eid", None)
    if eid is not None and eid != "":
        # Replace the first "eid" of extids (if any) by the current eid
        eids = [item for item in ref.extids if item[0] == "eid"]
        if eids:
            ref.extids.remove(eids[0])
        ref.extids.append(("eid", eid))

    label = ref.label
    if not label and with_label:
        label = f"[{i}]"
        ref.label = label

    if ref.type == "unknown":
        if not ref.citation_html:
            # Prefix the citation with the label, unless it already starts with it
            if with_label and not ref.citation_tex.startswith(label):
                ref.citation_html = f"{label} {ref.citation_tex}"
            else:
                ref.citation_html = ref.citation_tex

        if not ref.citation_xml:
            # The closing tag has to match the <mixed-citation> opening tag
            ref.citation_xml = f'<label>{escape(ref.label)}</label><mixed-citation xml:space="preserve">{ref.citation_tex}</mixed-citation>'
    else:
        ref.label = f"{label}" if with_label else ""
        # ref can be a Munch dictionary, or a RefData object.
        # Add RefBase member functions, like get_authors
        ref.get_authors = types.MethodType(RefBase.get_authors, ref)
        ref.get_editors = types.MethodType(RefBase.get_editors, ref)
        text = get_citation_html(ref)
        ref.citation_html = ref.citation_tex = text

    for contrib in ref.contributors:
        contrib["contrib_xml"] = get_contrib_xml(contrib, is_ref=True)

    if ref.type != "unknown":
        element_citation = jats_from_ref(ref)
        ref.citation_xml = f'<label>{escape(ref.label)}</label><element-citation publication-type="{ref.type}">{element_citation}</element-citation>'

557 

558 

def update_data_for_jats(data_article, create_author_if_empty=False, with_label=True):
    """
    Complete an article data object in place before a JATS export.

    Missing HTML titles fall back to the TeX ones, a missing title_xml is
    built with jats_parser.get_title_xml, the contrib_xml of every contributor
    is recomputed, the doi is added to ids when it is not there yet, and the
    references and the translations are updated as well.

    With create_author_if_empty=True, an article without contributors gets a
    single contributor with the "author" role.
    with_label is passed to update_ref_data_for_jats.
    """
    if not data_article.title_html:
        data_article.title_html = data_article.title_tex
    if not data_article.trans_title_html:
        data_article.trans_title_html = data_article.trans_title_tex
    if not data_article.title_xml:
        data_article.title_xml = jats_parser.get_title_xml(
            data_article.title_tex, data_article.trans_title_tex, data_article.trans_lang
        )

    for contributor in data_article.contributors:
        contributor["contrib_xml"] = get_contrib_xml(contributor)

    doi = data_article.doi
    if doi is not None and ("doi", doi) not in data_article.ids:
        data_article.ids.append(("doi", doi))

    if create_author_if_empty and len(data_article.contributors) == 0:
        author = create_contributor()
        author["role"] = "author"
        author["contrib_xml"] = get_contrib_xml(author)
        data_article.contributors = [author]

    # References are numbered from 1
    position = 0
    for ref in data_article.bibitems:
        position += 1
        update_ref_data_for_jats(ref, position, with_label=with_label)

    for translation in data_article.translations:
        update_data_for_jats(translation, create_author_if_empty, with_label)

588 

589 

def convert_refdata_for_editor(ref):
    """
    Add to a reference (in place) the attributes used by the editor.

    - contribs_text: one "last_name, first_name" line per contributor
    - type: forced to "unknown" when the reference has no article title, no
      chapter title and no source
    - doi: value of the "doi" extid ("" when there is none)
    - eid: value of the "eid" extid (left untouched when there is none)
    """
    lines = [f"{contrib['last_name']}, {contrib['first_name']}" for contrib in ref.contributors]
    ref.contribs_text = "\n".join(lines)

    if not (ref.article_title_tex or ref.chapter_title_tex or ref.source_tex):
        ref.type = "unknown"

    ref.doi = ""
    for extid in ref.extids:
        id_type = extid[0]
        if id_type == "doi":
            ref.doi = extid[1]
        elif id_type == "eid":
            ref.eid = extid[1]
    # URLs are in <comment>: no url attribute is extracted from ext_links