diff --git a/django/helper-scripts/ml_train.py b/django/helper-scripts/ml_train.py
index 89e90631..a7b3de03 100644
--- a/django/helper-scripts/ml_train.py
+++ b/django/helper-scripts/ml_train.py
@@ -20,6 +20,7 @@
 import re
 import requests
 import argparse
+from tqdm import tqdm # Progress bar library
 
 # Script to be run locally. It will download the db from the API or load from source.csv
 
@@ -36,112 +37,111 @@ args = parser.parse_args()
 
 def get_initial_count(url, api_key):
-    """Fetch the initial page to get the total count of articles."""
-    headers = {'Authorization': api_key}
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status() # Ensures HTTPError is raised for bad requests
-        data = response.json()
-        return data['count']
-    except requests.exceptions.RequestException as e:
-        print(f"Failed to fetch initial count: {e}")
-        return None
+    """Fetch the initial page to get the total count of articles."""
+    headers = {'Authorization': api_key}
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status() # Ensures HTTPError is raised for bad requests
+        data = response.json()
+        return data['count']
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to fetch initial count: {e}")
+        return None
 
 def fetch_articles_page(url, api_key, page):
-    """Fetch a single page of articles."""
-    params = {'page': page}
-    headers = {'Authorization': api_key}
-    try:
-        response = requests.get(url, headers=headers, params=params)
-        response.raise_for_status()
-        return response.json()['results']
-    except requests.exceptions.RequestException as e:
-        print(f"Failed to fetch page {page}: {e}")
-        return []
+    """Fetch a single page of articles."""
+    params = {'page': page}
+    headers = {'Authorization': api_key}
+    try:
+        response = requests.get(url, headers=headers, params=params)
+        response.raise_for_status()
+        return response.json()['results']
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to fetch page {page}: {e}")
+        return []
 
 # Util function to clean text
 def cleanText(text):
-    """
-    Cleans the input text by applying the following:
-
-    * change to lowercase text
-    * replace common symbols with a space
-    * replace invalid symbols with empty char
-    * remove stopwords from text
-
-    Arguments
-    ---------
-    text: a string
-
-    Output
-    ------
-    return: modified initial string
-    """
-
-    # change to lowercase text
-    text = text.lower()
-
-    # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
-    text = REPLACE_BY_SPACE_RE.sub(' ', text)
-
-    # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.
-    text = BAD_SYMBOLS_RE.sub('', text)
-
-    # remove stopwords from text
-    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
-
-    return text
+    """
+    Cleans the input text by applying the following:
+
+    * change to lowercase text
+    * replace common symbols with a space
+    * replace invalid symbols with empty char
+    * remove stopwords from text
+
+    Arguments
+    ---------
+    text: a string
+
+    Output
+    ------
+    return: modified initial string
+    """
+
+    # change to lowercase text
+    text = text.lower()
+
+    # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
+    text = REPLACE_BY_SPACE_RE.sub(' ', text)
+
+    # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.
+    text = BAD_SYMBOLS_RE.sub('', text)
+
+    # remove stopwords from text
+    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
+
+    return text
 
 def cleanHTML(input):
-    return BeautifulSoup(input, 'html.parser').get_text()
+    return BeautifulSoup(input, 'html.parser').get_text()
 
 class DenseTransformer(TransformerMixin):
-    def fit(self, X, y=None, **fit_params):
-        return self
+    def fit(self, X, y=None, **fit_params):
+        return self
 
-    def transform(self, X, y=None, **fit_params):
-        return X.toarray()
+    def transform(self, X, y=None, **fit_params):
+        return X.toarray()
 
 # Fetch and download data
 def get_articles(url, api_key):
-    headers = {'Authorization': api_key}
-    try:
-        response = requests.get(url, headers=headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            print(f"Error: Received response code {response.status_code}")
-            return None
-    except requests.exceptions.RequestException as e:
-        print(f"An error occurred: {e}")
-        return None
+    headers = {'Authorization': api_key}
+    try:
+        response = requests.get(url, headers=headers)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            print(f"Error: Received response code {response.status_code}")
+            return None
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
 
 source_data_csv = f"source.csv"
 if args.load_from_csv == True:
     # load data from csv file into a pandas dataframe
-    print("Loading data from source.csv")
-    articles_df = pd.read_csv(source_data_csv)
-
+    print("Loading data from source.csv")
+    articles_df = pd.read_csv(source_data_csv)
+
 else:
-    print("Loading data from API...")
-    articles_per_page = 10
-    total_articles = get_initial_count(get_api_url, api_key)
-    fetch_url = get_api_url
-    articles = []
-    if not total_articles:
-        print("Failed to get the total count of articles.")
-    else:
-        total_pages = math.ceil(total_articles / articles_per_page)
-        print(f"Total articles: {total_articles}, Total pages: {total_pages}")
-        # Concurrently fetch all pages
-        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
-            futures = [executor.submit(fetch_articles_page, get_api_url, api_key, page) for page in range(1, total_pages + 1)]
-            articles = []
-            for future in concurrent.futures.as_completed(futures):
-                articles.extend(future.result())
-        # Save fetched data to CSV
-        articles_df = pd.DataFrame(articles)
-        file_date = datetime.now().strftime("%Y-%m-%d")
-        articles_df.to_csv(source_data_csv, index=False)
+    print("Loading data from API...")
+    articles_per_page = 10
+    total_articles = get_initial_count(get_api_url, api_key)
+    fetch_url = get_api_url
+    articles = []
+    if not total_articles:
+        print("Failed to get the total count of articles.")
+    else:
+        total_pages = math.ceil(total_articles / articles_per_page)
+        print(f"Total articles: {total_articles}, Total pages: {total_pages}")
+        # Concurrently fetch all pages with progress bar
+        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+            futures = {executor.submit(fetch_articles_page, get_api_url, api_key, page): page for page in range(1, total_pages + 1)}
+            for future in tqdm(concurrent.futures.as_completed(futures), total=total_pages, desc="Fetching articles"):
+                articles.extend(future.result())
+        # Save fetched data to CSV
+        articles_df = pd.DataFrame(articles)
+        file_date = datetime.now().strftime("%Y-%m-%d")
+        articles_df.to_csv(source_data_csv, index=False)
 
 # Clean the dataset
 articles_df['relevant'] = articles_df['relevant'].fillna(value=0)
@@ -159,16 +159,16 @@ def get_articles(url, api_key):
 
 # Divide into train and test sets
 X_train, X_test, Y_train, Y_test = train_test_split(
-    # The array of inputs
-    input,
-    # The array of outputs
-    output,
-    # The size of the testing set in relation to the entire dataset (0.2 = 20%)
-    test_size = 0.2,
-    # This acts as a seed, to maintain consistency between tests
-    random_state = 42,
-    # Whether to shuffle the data before splitting, it helps maintain consistency between tests
-    shuffle = False
+    # The array of inputs
+    input,
+    # The array of outputs
+    output,
+    # The size of the testing set in relation to the entire dataset (0.2 = 20%)
+    test_size = 0.2,
+    # This acts as a seed, to maintain consistency between tests
+    random_state = 42,
+    # Whether to shuffle the data before splitting, it helps maintain consistency between tests
+    shuffle = False
 )
 # change the type of the Y axis to avoid error further on
 Y_train = Y_train.astype(int)
@@ -193,42 +193,42 @@ def get_articles(url, api_key):
 
 # Define a pipeline combining a text feature extractor with a classifier for each model
 pipelines[GNB] = Pipeline([
-    (VECTORIZER, vectorizer),
-    # This intermediate step is required because the GaussianNB
-    # model does not work with sparse vectors
-    ('to_dense', DenseTransformer()),
-    (CLASSIFIER, OneVsRestClassifier(GaussianNB())),
-    ])
+    (VECTORIZER, vectorizer),
+    # This intermediate step is required because the GaussianNB
+    # model does not work with sparse vectors
+    ('to_dense', DenseTransformer()),
+    (CLASSIFIER, OneVsRestClassifier(GaussianNB())),
+    ])
 
 pipelines[MNB] = Pipeline([
-    (VECTORIZER, vectorizer),
-    (CLASSIFIER, OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
-    ])
+    (VECTORIZER, vectorizer),
+    (CLASSIFIER, OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
+    ])
 
 pipelines[LR] = Pipeline([
-    (VECTORIZER, vectorizer),
-    (CLASSIFIER, OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
-    ])
+    (VECTORIZER, vectorizer),
+    (CLASSIFIER, OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
+    ])
 
 pipelines[LSVC] = Pipeline([
-    (VECTORIZER, vectorizer),
-    (CLASSIFIER, OneVsRestClassifier(LinearSVC(), n_jobs=1)),
-    ])
+    (VECTORIZER, vectorizer),
+    (CLASSIFIER, OneVsRestClassifier(LinearSVC(), n_jobs=1)),
+    ])
 
 for model, pipeline in pipelines.items():
-    # Train phase
-    print("Training the " + model + " model...")
-    pipeline.fit(X_train, Y_train)
-
-    # Testing accuracy
-    prediction = pipeline.predict(X_test)
-    accuracy = accuracy_score(Y_test, prediction)
-    print(" => Accuracy for the " + model + " model: {:2.1f}%".format(accuracy * 100))
+    # Train phase
+    print("Training the " + model + " model...")
+    pipeline.fit(X_train, Y_train)
+
+    # Testing accuracy
+    prediction = pipeline.predict(X_test)
+    accuracy = accuracy_score(Y_test, prediction)
+    print(" => Accuracy for the " + model + " model: {:2.1f}%".format(accuracy * 100))
 
 for model, pipeline in pipelines.items():
-    # Before saving, let's train the model with the entire dataset first
-    print("Training the " + model + " model with the entire dataset...")
-    pipeline.fit(input, output)
-    # Save the pipeline for later use (`compress` argument is to save as one single file with the entire pipeline)
-    dump(pipeline, './model_' + model + '.joblib', compress=1)
+    # Before saving, let's train the model with the entire dataset first
+    print("Training the " + model + " model with the entire dataset...")
+    pipeline.fit(input, output)
+    # Save the pipeline for later use (`compress` argument is to save as one single file with the entire pipeline)
+    dump(pipeline, './model_' + model + '.joblib', compress=1)
 
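For reference, below is a minimal, self-contained sketch of the tqdm + ThreadPoolExecutor pattern the patch introduces: map each submitted future back to its page number, iterate concurrent.futures.as_completed, and wrap it in tqdm with an explicit total= because the generator has no length. fetch_page and PAGE_COUNT are illustrative placeholders, not names from ml_train.py.

# Standalone sketch of the progress-bar pattern used in the patch above.
# fetch_page() and PAGE_COUNT are hypothetical stand-ins for fetch_articles_page()
# and the computed total_pages; no network access is needed to run this.
import concurrent.futures
import time

from tqdm import tqdm

PAGE_COUNT = 25  # assumed page count, for demonstration only

def fetch_page(page):
    """Simulate fetching one page of results."""
    time.sleep(0.1)
    return [{"page": page, "row": i} for i in range(10)]

articles = []
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Keying the dict by future (future -> page) mirrors the patch and makes it
    # possible to tell which page a finished (or failed) future belongs to.
    futures = {executor.submit(fetch_page, page): page for page in range(1, PAGE_COUNT + 1)}
    # as_completed() yields futures as they finish; tqdm draws the progress bar
    # and needs total= since as_completed() exposes no len().
    for future in tqdm(concurrent.futures.as_completed(futures),
                       total=len(futures), desc="Fetching articles"):
        articles.extend(future.result())

print(f"Fetched {len(articles)} rows")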