From 1d1b66c20e5a1635093260ed14d33d89f0cc1534 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Fri, 15 Apr 2022 15:17:05 +0100
Subject: [PATCH 1/2] wip

---
 django/gregory/2_train_models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/django/gregory/2_train_models.py b/django/gregory/2_train_models.py
index 5f68d0f5..129e68d9 100644
--- a/django/gregory/2_train_models.py
+++ b/django/gregory/2_train_models.py
@@ -14,10 +14,10 @@
 from django_cron import CronJobBase, Schedule
 
 class TrainModels(CronJobBase):
-	RUN_EVERY_MINS = 2880 # every 2 days
+	RUN_EVERY_MINS = 1 # every minute (was 2880, i.e. every 2 days)
 	schedule = Schedule(run_every_mins=RUN_EVERY_MINS)
 	code = 'gregory.train_models'    # a unique code
-	def do(self):    
+	def do(self):
 		# The CSV file that has the source data
 		SOURCE_DATA_CSV = "/code/gregory/data/source.csv"
 
@@ -104,3 +104,4 @@ def do(self):
 			pipeline.fit(input, output)
 			# Save the pipeline for later use (`compress` argument is to save as one single file with the entire pipeline)
 			dump(pipeline, '/code/gregory/ml_models/model_' + model + '.joblib', compress=1)
+	pass

From 2db5cc3c19a859fffaacc311518d904ed5965607 Mon Sep 17 00:00:00 2001
From: Bruno Amaral <mail@brunoamaral.eu>
Date: Fri, 29 Jul 2022 02:31:36 +0100
Subject: [PATCH 2/2] make sure summary length > 50 chars

---
 django/gregory/1_data_processor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/django/gregory/1_data_processor.py b/django/gregory/1_data_processor.py
index 09c7be46..754e7501 100644
--- a/django/gregory/1_data_processor.py
+++ b/django/gregory/1_data_processor.py
@@ -4,6 +4,7 @@
 from .utils.text_utils import cleanHTML
 import html
 from .models import Articles
+from django.db.models.functions import Length
 
 
 class DataProcessor(CronJobBase):
@@ -13,7 +14,7 @@ class DataProcessor(CronJobBase):
 
 	def do(self):
 		# Read the JSON data into a pandas dataframe
-		dataset = pd.DataFrame(list(Articles.objects.all().values()))
+		dataset = pd.DataFrame(list(Articles.objects.annotate(summary_len=Length('summary')).filter(summary_len__gt=50).values()))
 
 		# Give some info on the dataset
 		# dataset.info()