From 2cca072001a1895e3573f18fa65a62be45afbc31 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 00:28:26 +0000
Subject: [PATCH 1/7] create new ClinicalTrial class

---
 django/gregory/classes.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/django/gregory/classes.py b/django/gregory/classes.py
index e90a0d92..86a033b3 100644
--- a/django/gregory/classes.py
+++ b/django/gregory/classes.py
@@ -135,3 +135,29 @@ def find_doi(self,title=None):
 					if i == 5:
 						return None
 
+
+class ClinicalTrial:  # Plain holder for a trial's title, summary, link, publication date, relevance flag and identifiers.
+	def __init__(self, title=None, summary=None, link=None, published_date=None, relevant=None, identifiers=None):
+		self.title = title
+		self.summary = summary
+		self.link = link
+		self.published_date = published_date
+		self.relevant = relevant
+		self.identifiers = identifiers
+	def __str__(self):
+		return f"{self.title}, {self.identifiers}"
+	def __repr__(self):
+		return f"{self.title}, \"{self.identifiers}\""
+
+	def clean_summary(self=None,summary=None):
+		"""Return `summary` (default: self.summary) HTML-unescaped with class/id/name/style attributes removed, or None when there is no text; self.summary is NOT modified."""
+		from bs4 import BeautifulSoup
+		import html
+		if summary is None and self is not None:
+			summary = self.summary
+		if summary is not None:
+			soup = BeautifulSoup(html.unescape(summary),'html.parser')
+			for tag in soup():
+				for attribute in ["class", "id", "name", "style"]:
+					del tag[attribute]
+			return str(soup)

From 0c18f5608b9f0a113b94afbd1ca658968a0831d9 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 00:30:28 +0000
Subject: [PATCH 2/7] use ClinicalTrial class

---
 django/gregory/feedreader.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py
index ea15ac5e..eed919c9 100644
--- a/django/gregory/feedreader.py
+++ b/django/gregory/feedreader.py
@@ -11,7 +11,7 @@
 import os
 import re
 import gregory.functions as greg
-from gregory.classes import SciencePaper
+from gregory.classes import SciencePaper, ClinicalTrial
 from django.utils import timezone
 import pytz
 SITE = CustomSetting.objects.get(site__domain=os.environ.get('DOMAIN_NAME'))
@@ -149,7 +149,9 @@ def do(self):
 				if 'clinicaltrials.gov' in link:
 					nct = entry['guid']
 				identifiers = {"eudract": eudract, "euct": euct, "nct": nct}
+				clinical_trial = ClinicalTrial(title = entry['title'], summary = summary, link = link, published_date = published, identifiers = identifiers,)
+				clinical_trial.summary = clinical_trial.clean_summary()  # clean_summary() only returns the cleaned text; store it so the saved trial uses it
 				try:
-					trial = Trials.objects.create( discovery_date=timezone.now(), title = entry['title'], summary = summary, link = link, published_date = published, identifiers=identifiers, source = i)
+					trial = Trials.objects.create( discovery_date=timezone.now(), title = clinical_trial.title, summary = clinical_trial.summary, link = clinical_trial.link, published_date = clinical_trial.published_date, identifiers=clinical_trial.identifiers, source = i)
 				except:
-					pass
+					pass
\ No newline at end of file

From 3090cdefd010c05740607180e0f9899c43818e7f Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 00:40:42 +0000
Subject: [PATCH 3/7] add new method to clean up urls

---
 django/gregory/classes.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/django/gregory/classes.py b/django/gregory/classes.py
index e90a0d92..8b409102 100644
--- a/django/gregory/classes.py
+++ b/django/gregory/classes.py
@@ -26,6 +26,14 @@ def clean_abstract(self=None,abstract=None):
 					del tag[attribute]
 			return str(soup)
 
+	def clean_url(self=None):
+		"""Replace self.link with remove_utm(self.link); print a notice when no link is set."""
+		from gregory.functions import remove_utm
+		if self.link is not None:
+			self.link = remove_utm(self.link)
+		else:
+			print('no url found')
+
 	def refresh(self):
 		from db_maintenance.unpaywall import unpaywall_utils
 		from crossref.restful import Works, Etiquette

From 715299a22ef2a5be89d464e682852bbb55b070fd Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 00:41:06 +0000
Subject: [PATCH 4/7] put remove_utm in functions.py

---
 django/gregory/feedreader.py | 12 +-----------
 django/gregory/functions.py  | 11 +++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py
index ea15ac5e..ead29ed7 100644
--- a/django/gregory/feedreader.py
+++ b/django/gregory/feedreader.py
@@ -19,16 +19,6 @@
 my_etiquette = Etiquette(SITE.title, 'v8', CLIENT_WEBSITE, SITE.admin_email)
 works = Works(etiquette=my_etiquette)
 
-def remove_utm(url):
-	u = urlparse(url)
-	query = parse_qs(u.query, keep_blank_values=True)
-	query.pop('utm_source', None)
-	query.pop('utm_medium', None)
-	query.pop('utm_campaign', None)
-	query.pop('utm_content', None)
-	u = u._replace(query=urlencode(query, True))
-	return urlunparse(u)
-
 class FeedReaderTask(CronJobBase):
 	RUN_EVERY_MINS = 30
 	schedule = Schedule(run_every_mins=RUN_EVERY_MINS)
@@ -137,7 +127,7 @@ def do(self):
 				published = entry.get('published')
 				if published:
 					published = parse(entry['published'])
-				link = remove_utm(entry['link'])
+				link = greg.remove_utm(entry['link'])
 				eudract = None
 				euct = None
 				nct = None
diff --git a/django/gregory/functions.py b/django/gregory/functions.py
index 55fb30ab..72bcb14c 100644
--- a/django/gregory/functions.py
+++ b/django/gregory/functions.py
@@ -13,6 +13,17 @@
 from .models import Articles
 from django_cron import CronJobBase, Schedule
 
+def remove_utm(url):
+	u = urlparse(url)
+	# Decode the query string, drop the four UTM tracking keys, then rebuild the URL.
+	# keep_blank_values=True keeps parameters whose value is empty (e.g. "?a=").
+	params = parse_qs(u.query, keep_blank_values=True)
+	for tracking_key in ('utm_source', 'utm_medium', 'utm_campaign', 'utm_content'):
+		params.pop(tracking_key, None)
+	# doseq=True: parse_qs maps each key to a list, so emit one key=value pair per item.
+	return urlunparse(u._replace(query=urlencode(params, True)))
+
+
 def get_doi(title):
 	doi = None
 	if title != '':

From 00318d50be09de162fb57b7f3ebbd30cc53565ac Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 00:42:15 +0000
Subject: [PATCH 5/7] add clean_url method to ClinicalTrial

---
 django/gregory/classes.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/django/gregory/classes.py b/django/gregory/classes.py
index 86a033b3..feda8a31 100644
--- a/django/gregory/classes.py
+++ b/django/gregory/classes.py
@@ -161,3 +161,10 @@ def clean_summary(self=None,summary=None):
 				for attribute in ["class", "id", "name", "style"]:
 					del tag[attribute]
 			return str(soup)
+
+	def clean_url(self=None):  # Replace self.link with remove_utm(self.link); print a notice when no link is set.
+		from gregory.functions import remove_utm
+		if self.link is not None:
+			self.link = remove_utm(self.link)
+		else:
+			print('no url found')

From 50204113ec4ec96cac52773a1450971a95c11eb7 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 16:39:56 +0000
Subject: [PATCH 6/7] add missing greg. prefix to remove_utm call

---
 django/gregory/feedreader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/django/gregory/feedreader.py b/django/gregory/feedreader.py
index ec7e9b70..626d5af6 100644
--- a/django/gregory/feedreader.py
+++ b/django/gregory/feedreader.py
@@ -54,7 +54,7 @@ def do(self):
 					published = parse(entry['published'])
 				else:
 					published = parse(entry['prism_coverdate'])
-				link = remove_utm(entry['link'])
+				link = greg.remove_utm(entry['link'])
 				###
 				# This is a bad solution but it will have to do for now
 				###

From 06f322fc7625aaa729d9e392db26b706a055a957 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Sun, 19 Feb 2023 16:40:10 +0000
Subject: [PATCH 7/7] add missing import

---
 django/gregory/functions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/django/gregory/functions.py b/django/gregory/functions.py
index 72bcb14c..6c24a2ca 100644
--- a/django/gregory/functions.py
+++ b/django/gregory/functions.py
@@ -12,6 +12,7 @@
 from joblib import load
 from .models import Articles
 from django_cron import CronJobBase, Schedule
+from urllib.parse import urlencode, urlparse, urlunparse, parse_qs
 
 def remove_utm(url):
 	u = urlparse(url)