From 2cca072001a1895e3573f18fa65a62be45afbc31 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 00:28:26 +0000 Subject: [PATCH 1/7] create new ClinicalTrial class --- django/gregory/classes.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/django/gregory/classes.py b/django/gregory/classes.py index e90a0d92..86a033b3 100644 --- a/django/gregory/classes.py +++ b/django/gregory/classes.py @@ -135,3 +135,29 @@ def find_doi(self,title=None): if i == 5: return None + +class ClinicalTrial: + def __init__(self, title=None, summary=None, link=None, published_date=None, relevant=None, identifiers=None): + self.title = title + self.summary = summary + self.link = link + self.published_date = published_date + self.relevant = relevant + self.identifiers = identifiers + def __str__(self): + return f"{self.title}, {self.identifiers}" + def __repr__(self): + return f"{self.title}, \"{self.identifiers}\"" + + def clean_summary(self=None,summary=None): + from bs4 import BeautifulSoup + import html + if summary == None and self.summary != None: + summary = self.summary + if summary != None: + summary = html.unescape(summary) + soup = BeautifulSoup(summary,'html.parser') + for tag in soup(): + for attribute in ["class", "id", "name", "style"]: + del tag[attribute] + return str(soup) From 0c18f5608b9f0a113b94afbd1ca658968a0831d9 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 00:30:28 +0000 Subject: [PATCH 2/7] use ClinicalTrial class --- django/gregory/feedreader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py index ea15ac5e..eed919c9 100644 --- a/django/gregory/feedreader.py +++ b/django/gregory/feedreader.py @@ -11,7 +11,7 @@ import os import re import gregory.functions as greg -from gregory.classes import SciencePaper +from gregory.classes import SciencePaper, ClinicalTrial from django.utils import timezone import pytz SITE = CustomSetting.objects.get(site__domain=os.environ.get('DOMAIN_NAME')) @@ -149,7 +149,9 @@ def do(self): if 'clinicaltrials.gov' in link: nct = entry['guid'] identifiers = {"eudract": eudract, "euct": euct, "nct": nct} + clinical_trial = ClinicalTrial(title = entry['title'], summary = summary, link = link, published_date = published, identifiers = identifiers,) + clinical_trial.clean_summary() try: - trial = Trials.objects.create( discovery_date=timezone.now(), title = entry['title'], summary = summary, link = link, published_date = published, identifiers=identifiers, source = i) + trial = Trials.objects.create( discovery_date=timezone.now(), title = clinical_trial.title, summary = clinical_trial.summary, link = clinical_trial.link, published_date = clinical_trial.published_date, identifiers=clinical_trial.identifiers, source = i) except: - pass + pass \ No newline at end of file From 3090cdefd010c05740607180e0f9899c43818e7f Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 00:40:42 +0000 Subject: [PATCH 3/7] add new method to cleanup urls --- django/gregory/classes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/django/gregory/classes.py b/django/gregory/classes.py index e90a0d92..8b409102 100644 --- a/django/gregory/classes.py +++ b/django/gregory/classes.py @@ -26,6 +26,14 @@ def clean_abstract(self=None,abstract=None): del tag[attribute] return str(soup) + def clean_url(self=None): + from gregory.functions import remove_utm + if self.link != None: + self.link = remove_utm(self.link) + else: + print('no url found') + + def refresh(self): from db_maintenance.unpaywall import unpaywall_utils from crossref.restful import Works, Etiquette From 715299a22ef2a5be89d464e682852bbb55b070fd Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 00:41:06 +0000 Subject: [PATCH 4/7] put remove_utm in functions.py --- django/gregory/feedreader.py | 12 +----------- django/gregory/functions.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py index ea15ac5e..ead29ed7 100644 --- a/django/gregory/feedreader.py +++ b/django/gregory/feedreader.py @@ -19,16 +19,6 @@ my_etiquette = Etiquette(SITE.title, 'v8', CLIENT_WEBSITE, SITE.admin_email) works = Works(etiquette=my_etiquette) -def remove_utm(url): - u = urlparse(url) - query = parse_qs(u.query, keep_blank_values=True) - query.pop('utm_source', None) - query.pop('utm_medium', None) - query.pop('utm_campaign', None) - query.pop('utm_content', None) - u = u._replace(query=urlencode(query, True)) - return urlunparse(u) - class FeedReaderTask(CronJobBase): RUN_EVERY_MINS = 30 schedule = Schedule(run_every_mins=RUN_EVERY_MINS) @@ -137,7 +127,7 @@ def do(self): published = entry.get('published') if published: published = parse(entry['published']) - link = remove_utm(entry['link']) + link = greg.remove_utm(entry['link']) eudract = None euct = None nct = None diff --git a/django/gregory/functions.py b/django/gregory/functions.py index 55fb30ab..72bcb14c 100644 --- a/django/gregory/functions.py +++ b/django/gregory/functions.py @@ -13,6 +13,17 @@ from .models import Articles from django_cron import CronJobBase, Schedule +def remove_utm(url): + u = urlparse(url) + query = parse_qs(u.query, keep_blank_values=True) + query.pop('utm_source', None) + query.pop('utm_medium', None) + query.pop('utm_campaign', None) + query.pop('utm_content', None) + u = u._replace(query=urlencode(query, True)) + return urlunparse(u) + + def get_doi(title): doi = None if title != '': From 00318d50be09de162fb57b7f3ebbd30cc53565ac Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 00:42:15 +0000 Subject: [PATCH 5/7] add clean url method --- django/gregory/classes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/django/gregory/classes.py b/django/gregory/classes.py index 86a033b3..feda8a31 100644 --- a/django/gregory/classes.py +++ b/django/gregory/classes.py @@ -161,3 +161,10 @@ def clean_summary(self=None,summary=None): for attribute in ["class", "id", "name", "style"]: del tag[attribute] return str(soup) + + def clean_url(self=None): + from gregory.functions import remove_utm + if self.link != None: + self.link = remove_utm(self.link) + else: + print('no url found') From 50204113ec4ec96cac52773a1450971a95c11eb7 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 16:39:56 +0000 Subject: [PATCH 6/7] add missing parent --- django/gregory/feedreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py index ec7e9b70..626d5af6 100644 --- a/django/gregory/feedreader.py +++ b/django/gregory/feedreader.py @@ -54,7 +54,7 @@ def do(self): published = parse(entry['published']) else: published = parse(entry['prism_coverdate']) - link = remove_utm(entry['link']) + link = greg.remove_utm(entry['link']) ### # This is a bad solution but it will have to do for now ### From 06f322fc7625aaa729d9e392db26b706a055a957 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Sun, 19 Feb 2023 16:40:10 +0000 Subject: [PATCH 7/7] add missing import --- django/gregory/functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/django/gregory/functions.py b/django/gregory/functions.py index 72bcb14c..6c24a2ca 100644 --- a/django/gregory/functions.py +++ b/django/gregory/functions.py @@ -12,6 +12,7 @@ from joblib import load from .models import Articles from django_cron import CronJobBase, Schedule +from urllib.parse import urlencode, urlparse, urlunparse, parse_qs def remove_utm(url): u = urlparse(url)