From 1d1b66c20e5a1635093260ed14d33d89f0cc1534 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Fri, 15 Apr 2022 15:17:05 +0100 Subject: [PATCH 1/2] wip --- django/gregory/2_train_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/django/gregory/2_train_models.py b/django/gregory/2_train_models.py index 5f68d0f5..129e68d9 100644 --- a/django/gregory/2_train_models.py +++ b/django/gregory/2_train_models.py @@ -14,10 +14,10 @@ from django_cron import CronJobBase, Schedule class TrainModels(CronJobBase): - RUN_EVERY_MINS = 2880 # every 2 days + RUN_EVERY_MINS = 1 # every 2 days schedule = Schedule(run_every_mins=RUN_EVERY_MINS) code = 'gregory.train_models' # a unique code - def do(self): + def do(self): # The CSV file that has the source data SOURCE_DATA_CSV = "/code/gregory/data/source.csv" @@ -104,3 +104,4 @@ def do(self): pipeline.fit(input, output) # Save the pipeline for later use (`compress` argument is to save as one single file with the entire pipeline) dump(pipeline, '/code/gregory/ml_models/model_' + model + '.joblib', compress=1) + pass From 2db5cc3c19a859fffaacc311518d904ed5965607 Mon Sep 17 00:00:00 2001 From: Bruno Amaral Date: Fri, 29 Jul 2022 02:31:36 +0100 Subject: [PATCH 2/2] make sure summary length > 50 chars --- django/gregory/1_data_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/django/gregory/1_data_processor.py b/django/gregory/1_data_processor.py index 09c7be46..754e7501 100644 --- a/django/gregory/1_data_processor.py +++ b/django/gregory/1_data_processor.py @@ -4,6 +4,7 @@ from .utils.text_utils import cleanHTML import html from .models import Articles +from django.db.models.functions import Length class DataProcessor(CronJobBase): @@ -13,7 +14,7 @@ class DataProcessor(CronJobBase): def do(self): # Read the JSON data into a pandas dataframe - dataset = pd.DataFrame(list(Articles.objects.all().values())) + dataset = pd.DataFrame(list(Articles.objects.annotate(summary_len=Length('summary')).filter(summary_len__gt=50).values())) # Give some info on the dataset # dataset.info()