Coverage for apps/ptf/model_data_converter.py: 60%
362 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-05-19 19:20 +0000
1##################################################################################################
2#
3# README
4#
5# Operations on the xml data objects
6# Django DB -> Data objects
7#
8##################################################################################################
10import types
12from django.db.models import Q
14from ptf.cmds.xml.citation_html import get_citation_html
15from ptf.cmds.xml.jats import jats_parser
16from ptf.cmds.xml.xml_base import RefBase
17from ptf.cmds.xml.xml_utils import escape
18from ptf.cmds.xml.xml_utils import get_contrib_xml
19from ptf.model_data import ArticleData
20from ptf.model_data import BookData
21from ptf.model_data import BookPartData
22from ptf.model_data import Foo
23from ptf.model_data import IssueData
24from ptf.model_data import JournalData
25from ptf.model_data import MathdocPublicationData
26from ptf.model_data import PublisherData
27from ptf.model_data import RefData
28from ptf.model_data import create_contributor
def db_append_obj_with_location_to_list(resource_qs, data_list):
    """Append one plain dict per object of ``resource_qs`` to ``data_list``.

    Every dict holds ``rel``, ``mimetype``, ``location`` and ``base`` (the
    ``base`` attribute of the object's own ``base``, or "" when the object has
    no base).  ``metadata``, ``text`` and ``caption`` are added only for
    objects that define those attributes.

    ``data_list`` is extended in place; nothing is returned.
    """
    optional_attrs = ("metadata", "text", "caption")

    for item in resource_qs:
        entry = {
            "rel": item.rel,
            "mimetype": item.mimetype,
            "location": item.location,
        }
        entry["base"] = item.base.base if item.base else ""

        # Not every located object carries these attributes.
        entry.update(
            {name: getattr(item, name) for name in optional_attrs if hasattr(item, name)}
        )

        data_list.append(entry)
def db_to_contributors(qs):
    """Convert the contributions reachable through ``qs.all()`` into contributor dicts.

    Each result starts from ``create_contributor()`` and is filled from the
    matching contribution.  ``orcid`` and ``mid`` are normalised to "" when
    falsy; ``addresses`` is the list of ``address`` values of the
    contribution's ``contribaddress_set``.

    Returns the list of contributors, in iteration order.
    """
    converted = []

    for contribution in qs.all():
        contributor = create_contributor()

        values = {
            "first_name": contribution.first_name,
            "last_name": contribution.last_name,
            "prefix": contribution.prefix,
            "suffix": contribution.suffix,
            "orcid": contribution.orcid if contribution.orcid else "",
            "email": contribution.email,
            "string_name": contribution.string_name,
            "mid": contribution.mid if contribution.mid else "",
            "addresses": [
                entry.address for entry in contribution.contribaddress_set.all()
            ],
            "role": contribution.role,
            "deceased_before_publication": contribution.deceased_before_publication,
            "equal_contrib": contribution.equal_contrib,
            "corresponding": contribution.corresponding,
            "contrib_xml": contribution.contrib_xml,
        }
        for key, value in values.items():
            contributor[key] = value

        converted.append(contributor)

    return converted
def db_to_resource_data_common(resource, data_resource):
    """Copy the fields shared by all resources from a DB object to a data object.

    ``resource`` is read, ``data_resource`` is filled in place: scalar
    attributes are copied, list attributes (ids, links, streams, counts,
    keywords, abstracts, awards, ...) are rebuilt from the related sets, and
    one ``Foo`` object per relation is appended to ``data_resource.relations``.
    Nothing is returned.
    """
    # Scalar attributes copied verbatim, in this order.
    scalar_names = (
        "pid",
        "doi",
        "lang",
        "title_xml",
        "title_tex",
        "title_html",
        "abbrev",
        "trans_lang",
        "trans_title_tex",
        "trans_title_html",
        "funding_statement_xml",
        "funding_statement_html",
        "footnotes_xml",
        "footnotes_html",
    )
    for name in scalar_names:
        setattr(data_resource, name, getattr(resource, name))

    data_resource.ids = [
        (res_id.id_type, res_id.id_value) for res_id in resource.resourceid_set.all()
    ]
    data_resource.extids = [
        (ext_id.id_type, ext_id.id_value) for ext_id in resource.extid_set.all()
    ]

    # Objects with a location are appended to lists that already exist on the data object.
    db_append_obj_with_location_to_list(resource.extlink_set.all(), data_resource.ext_links)
    db_append_obj_with_location_to_list(resource.datastream_set.all(), data_resource.streams)
    db_append_obj_with_location_to_list(
        resource.relatedobject_set.all(), data_resource.related_objects
    )

    # Figures (rel='html-image') are deliberately not converted here; the original
    # note says they are updated by the FullText import after the Cedrics import.
    supplementary_qs = resource.relatedobject_set.filter(
        Q(rel="supplementary-material") | Q(rel="review")
    )
    db_append_obj_with_location_to_list(supplementary_qs, data_resource.supplementary_materials)

    data_resource.counts = [
        (counter.name, counter.value) for counter in resource.resourcecount_set.all()
    ]

    data_resource.contributors = db_to_contributors(resource.contributions)

    def typed_values(items):
        # Keywords and subjects share the same type/lang/value shape.
        return [{"type": item.type, "lang": item.lang, "value": item.value} for item in items]

    data_resource.kwds = typed_values(resource.kwd_set.all())
    data_resource.subjs = typed_values(resource.subj_set.all())

    abstract_keys = ("tag", "lang", "value_xml", "value_tex", "value_html")
    data_resource.abstracts = [
        {key: getattr(abstract, key) for key in abstract_keys}
        for abstract in resource.abstract_set.all()
    ]

    data_resource.awards = [
        {"abbrev": award.abbrev, "award_id": award.award_id}
        for award in resource.award_set.all()
    ]

    def add_relation(rel_type, id_value):
        relation_data = Foo()
        relation_data.rel_type = rel_type
        relation_data.id_value = id_value
        data_resource.relations.append(relation_data)

    # The resource is the subject: keep the left label and the object's pid.
    for relation in resource.subject_of.all():
        add_relation(relation.rel_info.left, relation.object_pid)

    # The resource is the object: keep the right label and the subject's pid.
    for relation in resource.object_of.all():
        add_relation(relation.rel_info.right, relation.subject_pid)

    # ISSNs are copied only for resources that define them.
    for name in ("issn", "e_issn"):
        if hasattr(resource, name):
            setattr(data_resource, name, getattr(resource, name))
def db_to_publisher_data(publisher):
    """Build a ``PublisherData`` from a DB publisher.

    Only the name (``pub_name``) and location (``pub_loc``) are copied;
    ``ext_links`` is always set to an empty list.
    """
    result = PublisherData()
    result.name, result.loc = publisher.pub_name, publisher.pub_loc

    # TODO: ext_links ?
    result.ext_links = []

    return result
def db_to_publication_data(collection):
    """Build a ``MathdocPublicationData`` from a DB collection.

    Copies the common resource fields, then ``coltype``, ``wall``, ``issn``
    and ``e_issn``.
    """
    publication = MathdocPublicationData()
    db_to_resource_data_common(collection, publication)

    for name in ("coltype", "wall", "issn", "e_issn"):
        setattr(publication, name, getattr(collection, name))

    return publication
def db_to_journal_data(collection):
    """Build a ``JournalData`` holding only the common resource fields of ``collection``."""
    # Open questions kept from the original code:
    # - a JournalData has no coltype ?
    # - a JournalData has a publisher but it does not seem to be used anywhere ?
    #   The publisher seems to belong to the issue/article and not to the Journal.
    journal = JournalData()
    db_to_resource_data_common(collection, journal)
    return journal
def db_to_collection_data(collection):
    """Build a ``MathdocPublicationData`` from a DB collection.

    Copies the common resource fields, ``coltype``, ``issn`` and ``e_issn``.
    ``vseries``, ``volume`` and ``seq`` are copied only when ``collection``
    defines them.
    """
    result = MathdocPublicationData()
    db_to_resource_data_common(collection, result)

    for name in ("coltype", "issn", "e_issn"):
        setattr(result, name, getattr(collection, name))

    # attributes used for CollectionMembership
    for name in ("vseries", "volume", "seq"):
        if hasattr(collection, name):
            setattr(result, name, getattr(collection, name))

    return result
def db_to_issue_data(container, articles=None):
    """Build an ``IssueData`` (with its articles) from a DB container.

    ``articles`` may be given when the articles were prefetched or filtered
    beforehand; when it is falsy, ``container.article_set.all()`` is used.
    NOTE(review): an explicitly empty ``articles`` is falsy too, so it also
    falls back to all the articles of the container - confirm this is intended.
    """
    issue = IssueData()
    db_to_resource_data_common(container, issue)

    for name in ("ctype", "year", "vseries", "volume", "number"):
        setattr(issue, name, getattr(container, name))

    last_modified = ""
    if container.last_modified:
        last_modified = container.last_modified.isoformat()
    issue.last_modified_iso_8601_date_str = last_modified

    deployed = ""
    if container.deployed_date():
        deployed = container.deployed_date().isoformat()
    issue.prod_deployed_date_iso_8601_date_str = deployed

    issue.journal = db_to_journal_data(container.my_collection)
    issue.publisher = db_to_publisher_data(container.my_publisher)
    issue.provider = container.provider.name

    # a Container has a seq, but it is used only for the books collections

    # articles may have been prefetched / filtered before
    source_articles = articles if articles else container.article_set.all()
    for article in source_articles:
        issue.articles.append(db_to_article_data(article))

    return issue
def db_to_book_data(container):
    """Build a ``BookData`` (with its parts and references) from a DB container.

    The result holds the common resource fields, the publisher, the
    collections the book belongs to (``incollection``), the front matter when
    the container has one, the body, the book parts (one per article of the
    container) and the bibliography.

    Fix over the previous version: each entry of
    ``container.my_other_collections`` is now converted itself.  The loop
    used to convert ``container.my_collection`` again on every iteration, so
    the main collection was duplicated and the other ones were lost.
    """
    data_book = BookData()

    db_to_resource_data_common(container, data_book)

    data_book.ctype = container.ctype
    data_book.year = container.year

    data_book.publisher = db_to_publisher_data(container.my_publisher)
    # NOTE(review): db_to_issue_data stores ``container.provider.name`` whereas the
    # provider object itself is stored here - confirm which form is expected.
    data_book.provider = container.provider

    data_col = db_to_collection_data(container.my_collection)
    # These attributes are required when adding a container to solr
    if not hasattr(data_col, "vseries"):
        data_col.vseries = 0
    if not hasattr(data_col, "volume"):
        data_col.volume = 0
    data_book.incollection.append(data_col)

    for collection in container.my_other_collections.all():
        data_book.incollection.append(db_to_collection_data(collection))

    if hasattr(container, "frontmatter") and container.frontmatter is not None:
        data_book.frontmatter_xml = container.frontmatter.value_xml
        data_book.frontmatter_toc_html = container.frontmatter.value_html
        data_book.frontmatter_foreword_html = container.frontmatter.foreword_html
    data_book.body = container.get_body()

    data_book.last_modified_iso_8601_date_str = (
        container.last_modified.isoformat() if container.last_modified else ""
    )
    # Ask for the deployed date once instead of once for the test and once for the value.
    deployed_date = container.deployed_date()
    data_book.prod_deployed_date_iso_8601_date_str = (
        deployed_date.isoformat() if deployed_date else ""
    )

    for bookpart in container.article_set.all():
        data_book.parts.append(db_to_bookpart_data(bookpart))

    for bibitem in container.bibitem_set.all():
        data_ref = db_to_ref_data(bibitem, data_book.lang)
        data_book.bibitems.append(data_ref)
        data_book.bibitem.append(data_ref.citation_html)

    return data_book
def db_to_article_data(article):
    """Build an ``ArticleData`` from a DB article, recursing into its translations.

    Besides the common resource fields, the result holds the page
    information, the dates as ISO 8601 strings ("" when a date is missing),
    the history dates that are set, the body in its various formats and the
    bibliography (converted with the language "und").
    """
    result = ArticleData()
    db_to_resource_data_common(article, result)

    result.atype = article.atype
    result.seq = str(article.seq)

    for name in (
        "fpage",
        "lpage",
        "page_range",
        "page_type",
        "article_number",
        "talk_number",
        "elocation",
    ):
        setattr(result, name, getattr(article, name))
    result.coi_statement = article.coi_statement if article.coi_statement else ""

    published = ""
    if article.date_published:
        published = article.date_published.isoformat()
    result.date_published_iso_8601_date_str = published

    # The deployed date is asked for only when the article is attached to a container.
    deployed = ""
    if article.my_container and article.deployed_date():
        deployed = article.deployed_date().isoformat()
    result.prod_deployed_date_iso_8601_date_str = deployed

    # Only the history dates that are set are kept.
    candidate_dates = (
        ("received", article.date_received),
        ("revised", article.date_revised),
        ("accepted", article.date_accepted),
        ("online", article.date_online_first),
    )
    history = []
    for date_type, value in candidate_dates:
        if value:
            history.append({"type": date_type, "date": value.isoformat()})
    result.history_dates = history

    result.body = article.get_body()
    result.body_html = article.body_html
    result.body_tex = article.body_tex
    result.body_xml = article.body_xml

    for bibitem in article.bibitem_set.all():
        reference = db_to_ref_data(bibitem, "und")
        result.bibitems.append(reference)
        result.bibitem.append(reference.citation_html)

    for translation in article.translations.all():
        result.translations.append(db_to_article_data(translation))

    return result
def db_to_bookpart_data(article):
    """Build a ``BookPartData`` from a DB article that is part of a book.

    Copies the common resource fields, the article type, the page
    information, the front matter when the article has one, the body and the
    bibliography (converted with the language of the book part).
    """
    part = BookPartData()
    db_to_resource_data_common(article, part)

    for name in ("atype", "fpage", "lpage", "page_range", "page_type"):
        setattr(part, name, getattr(article, name))

    frontmatter_present = hasattr(article, "frontmatter") and article.frontmatter is not None
    if frontmatter_present:
        part.frontmatter_xml = article.frontmatter.value_xml
        part.frontmatter_toc_html = article.frontmatter.value_html
        part.frontmatter_foreword_html = article.frontmatter.foreword_html
    part.body = article.get_body()

    for bibitem in article.bibitem_set.all():
        reference = db_to_ref_data(bibitem, part.lang)
        part.bibitems.append(reference)
        part.bibitem.append(reference.citation_html)

    return part
def db_to_ref_data(bibitem, lang):
    """Build a ``RefData`` (created with ``lang``) from a DB bibitem.

    The scalar fields are copied verbatim, ``extids`` is rebuilt from
    ``bibitem.bibitemid_set`` as (id_type, id_value) tuples and the
    contributors are converted with ``db_to_contributors``.
    """
    reference = RefData(lang=lang)

    copied_names = (
        "type",
        "user_id",
        "label",
        "citation_xml",
        "citation_tex",
        "citation_html",
        "publisher_name",
        "publisher_loc",
        "article_title_tex",
        "chapter_title_tex",
        "institution",
        "series",
        "volume",
        "issue",
        "month",
        "year",
        "comment",
        "annotation",
        "fpage",
        "lpage",
        "page_range",
        "size",
        "source_tex",
    )
    for name in copied_names:
        setattr(reference, name, getattr(bibitem, name))

    reference.extids = [
        (item_id.id_type, item_id.id_value) for item_id in bibitem.bibitemid_set.all()
    ]

    reference.contributors = db_to_contributors(bibitem.contributions)

    return reference
def jats_from_ref_comment(ref):
    """Return the ``<comment>`` element built from ``ref.comment``.

    Returns "" when the comment is None.  Otherwise the escaped comment is
    wrapped in ``<comment xml:space="preserve">``; the first "http://" URL
    (or, when there is none, the first "https://" URL) is turned into an
    ``<ext-link>``.  The URL ends at the next space or at the end of the text.
    """
    comment = ref.comment
    if comment is None:
        return ""

    url_start = comment.find("http://")
    if url_start == -1:
        url_start = comment.find("https://")

    if url_start == -1:
        # No URL: the whole comment is plain text.
        inner = escape(comment)
    else:
        url_end = comment.find(" ", url_start)
        has_tail = url_end != -1

        raw_url = comment[url_start:url_end] if has_tail else comment[url_start:]
        url = escape(raw_url)

        pieces = [
            escape(comment[0:url_start]),
            f'<ext-link xlink:href="{url}">{url}</ext-link>',
        ]
        if has_tail:
            # NOTE(review): the space that terminates the URL is skipped, so the
            # text that follows is glued to the link - confirm this is intended.
            pieces.append(escape(comment[url_end + 1 :]))
        inner = "".join(pieces)

    return f'<comment xml:space="preserve">{inner}</comment>'
def jats_from_ref_attr(
    ref,
    attr_name,
    jats_tag="",
    preserve=False,
    attr_type=None,
    attr_type_value="",
    convert_html_tag=False,
):
    """Return the JATS element for one attribute of a reference.

    Parameters:
        ref: object holding the attribute.
        attr_name: name of the attribute to read on ``ref``.
        jats_tag: name of the element to create; defaults to ``attr_name``.
        preserve: add ``xml:space="preserve"`` to the element.
        attr_type / attr_type_value: optional XML attribute (name and value)
            added to the element, e.g. ``pub-id-type="doi"``.
        convert_html_tag: build the content with
            ``jats_parser.get_single_title_xml`` instead of ``escape``.

    Returns "" when ``ref`` has no such attribute or when its value is falsy.

    Fix over the previous version: ``convert_html_tag`` is now honoured
    whatever the combination of ``preserve`` and ``attr_type``.  Two of the
    four branches used to fall back to ``escape(attr)``; in particular a
    non-preserved element without ``attr_type`` (the chapter title case) was
    never converted.
    """
    if not hasattr(ref, attr_name):
        return ""

    attr = getattr(ref, attr_name)
    if not attr:
        return ""

    tag = jats_tag if jats_tag else attr_name
    value = jats_parser.get_single_title_xml(attr) if convert_html_tag else escape(attr)

    # Same attribute order as before: the typed attribute first, then xml:space.
    xml_attributes = ""
    if attr_type is not None:
        xml_attributes += f' {attr_type}="{attr_type_value}"'
    if preserve:
        xml_attributes += ' xml:space="preserve"'

    return f"<{tag}{xml_attributes}>{value}</{tag}>"
def jats_from_ref(ref):
    """Return the concatenated JATS elements describing the reference ``ref``.

    The elements are emitted in a fixed order: authors, titles and source,
    editors, the publication details, the DOI, the pages, the size and
    finally the comment.  ``ref`` must provide ``get_authors`` and
    ``get_editors``.
    """
    parts = []

    authors = ref.get_authors()
    if authors is not None:
        parts.extend(author["contrib_xml"] for author in authors)

    parts.append(
        jats_from_ref_attr(
            ref, "article_title_tex", "article-title", preserve=True, convert_html_tag=True
        )
    )
    parts.append(
        jats_from_ref_attr(ref, "chapter_title_tex", "chapter-title", convert_html_tag=True)
    )
    parts.append(
        jats_from_ref_attr(ref, "source_tex", "source", preserve=True, convert_html_tag=True)
    )

    editors = ref.get_editors()
    if editors is not None:
        parts.extend(editor["contrib_xml"] for editor in editors)

    parts.append(jats_from_ref_attr(ref, "series", preserve=True))
    parts.append(jats_from_ref_attr(ref, "volume"))
    parts.append(jats_from_ref_attr(ref, "publisher_name", "publisher-name"))
    parts.append(jats_from_ref_attr(ref, "publisher_loc", "publisher-loc"))
    for plain_name in ("institution", "year", "issue"):
        parts.append(jats_from_ref_attr(ref, plain_name))
    parts.append(
        jats_from_ref_attr(ref, "doi", "pub-id", attr_type="pub-id-type", attr_type_value="doi")
    )
    parts.append(jats_from_ref_attr(ref, "fpage"))
    parts.append(jats_from_ref_attr(ref, "lpage"))
    parts.append(jats_from_ref_attr(ref, "size", "size"))
    parts.append(jats_from_ref_comment(ref))

    return "".join(parts)
def update_ref_data_for_jats(ref, i, with_label=True):
    """Complete a reference in place so that it can be exported as JATS.

    Set with_label=False if you do not want a label in the citation_html
    (for example in the citedby).

    Parameters:
        ref: the reference to update (a Munch dictionary or a RefData object).
        i: position of the reference, used to build the label "[i]" when the
            reference has none and ``with_label`` is true.
        with_label: whether a label is generated / kept.

    Effects on ``ref``:
        - a non-empty ``eid`` replaces the first "eid" entry of ``extids``;
        - for an "unknown" reference, ``citation_html`` and ``citation_xml``
          are derived from ``citation_tex`` when they are empty;
        - for any other type, ``citation_html`` / ``citation_tex`` are
          regenerated with ``get_citation_html`` and ``citation_xml`` is
          rebuilt as an ``<element-citation>``;
        - ``contrib_xml`` is recomputed for every contributor.

    Fix over the previous version: the mixed citation is closed with
    ``</mixed-citation>``.  The closing tag was misspelled
    ``</mixed_ciation>``, which produced malformed XML.
    """
    if hasattr(ref, "eid") and ref.eid is not None and ref.eid != "":
        eids = [item for item in ref.extids if item[0] == "eid"]
        if eids:
            ref.extids.remove(eids[0])
        ref.extids.append(("eid", ref.eid))

    label = ref.label
    if not label and with_label:
        label = f"[{i}]"
        ref.label = label

    if ref.type == "unknown":
        if not ref.citation_html:
            # Prefix the label unless the citation already starts with it.
            if with_label and ref.citation_tex.find(label) != 0:
                ref.citation_html = f"{label} {ref.citation_tex}"
            else:
                ref.citation_html = ref.citation_tex

        if not ref.citation_xml:
            ref.citation_xml = (
                f"<label>{escape(ref.label)}</label>"
                f'<mixed-citation xml:space="preserve">{ref.citation_tex}</mixed-citation>'
            )
    else:
        ref.label = f"{label}" if with_label else ""
        # ref can be a Munch dictionary, or a RefData object.
        # Add RefBase member functions, like get_authors
        ref.get_authors = types.MethodType(RefBase.get_authors, ref)
        ref.get_editors = types.MethodType(RefBase.get_editors, ref)
        text = get_citation_html(ref)
        ref.citation_html = ref.citation_tex = text

    for contrib in ref.contributors:
        contrib["contrib_xml"] = get_contrib_xml(contrib, is_ref=True)

    # Done after the contributors loop: jats_from_ref reads their contrib_xml.
    if ref.type != "unknown":
        element_citation = jats_from_ref(ref)
        ref.citation_xml = (
            f"<label>{escape(ref.label)}</label>"
            f'<element-citation publication-type="{ref.type}">{element_citation}</element-citation>'
        )
def update_data_for_jats(data_article, create_author_if_empty=False, with_label=True):
    """Complete an article data object in place before its export as JATS.

    - empty HTML titles fall back to the TeX titles and an empty
      ``title_xml`` is generated from the TeX titles;
    - ``contrib_xml`` is recomputed for every contributor;
    - the DOI, when not None, is added to ``ids`` unless already there;
    - with ``create_author_if_empty``, an article without contributor gets
      a single contributor with the role "author";
    - the references (numbered from 1) and the translations are updated too.
    """
    if not data_article.title_html:
        data_article.title_html = data_article.title_tex
    if not data_article.trans_title_html:
        data_article.trans_title_html = data_article.trans_title_tex
    if not data_article.title_xml:
        data_article.title_xml = jats_parser.get_title_xml(
            data_article.title_tex, data_article.trans_title_tex, data_article.trans_lang
        )

    for contributor in data_article.contributors:
        contributor["contrib_xml"] = get_contrib_xml(contributor)

    if data_article.doi is not None:
        doi_id = ("doi", data_article.doi)
        if doi_id not in data_article.ids:
            data_article.ids.append(doi_id)

    if create_author_if_empty and len(data_article.contributors) == 0:
        placeholder = create_contributor()
        placeholder["role"] = "author"
        placeholder["contrib_xml"] = get_contrib_xml(placeholder)
        data_article.contributors = [placeholder]

    position = 0
    for reference in data_article.bibitems:
        position += 1
        update_ref_data_for_jats(reference, position, with_label=with_label)

    for translation in data_article.translations:
        update_data_for_jats(translation, create_author_if_empty, with_label)
def convert_refdata_for_editor(ref):
    """Prepare a reference, in place, for the editor.

    - ``contribs_text`` gets one "last_name, first_name" line per contributor;
    - the type becomes "unknown" when the reference has neither an article
      title, nor a chapter title, nor a source;
    - ``doi`` is reset to "" and then ``doi`` / ``eid`` are filled from the
      matching entries of ``extids`` (the last matching entry wins).
    """
    names = [
        f"{contributor['last_name']}, {contributor['first_name']}"
        for contributor in ref.contributors
    ]
    ref.contribs_text = "\n".join(names)

    has_title = ref.article_title_tex or ref.chapter_title_tex or ref.source_tex
    if not has_title:
        ref.type = "unknown"

    ref.doi = ""
    editable_ids = ("doi", "eid")
    for extid in ref.extids:
        id_type = extid[0]
        if id_type in editable_ids:
            setattr(ref, id_type, extid[1])
    # URLs are in <comment>: no ``url`` attribute is extracted from the ext_links.