-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
83 lines (64 loc) · 2.63 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
from math import sqrt
import joblib
import pandas as pd
from TaxiFareModelAdvanced.params import MODEL_NAME
from google.cloud import storage
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Relative path of the joblib-serialized model that generate_submission_csv
# loads from the current working directory.
PATH_TO_LOCAL_MODEL = 'model.joblib'
# S3 URL of the test-set CSV read by get_test_data.
AWS_BUCKET_TEST_PATH = "s3://wagon-public-datasets/taxi-fare-test.csv"
# Default bucket name passed to storage.Client().bucket() by download_model.
BUCKET_NAME = "XXX" # ⚠️ replace with your BUCKET NAME
def get_test_data(nrows, data="s3"):
    """Load the test set (or the first rows of it) into a DataFrame.

    Args:
        nrows: number of rows to read; only honoured by the default source.
        data: selects where the CSV is read from:
            - "local": the whole file at ``data/test.csv``;
            - "full": the whole file at ``AWS_BUCKET_TEST_PATH``;
            - any other value (default "s3"): the first ``nrows`` rows of
              ``AWS_BUCKET_TEST_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # TODO (left over from the template): add Client() here
    if data == "local":
        # ⚠️ to test from actual KAGGLE test set for submission
        return pd.read_csv("data/test.csv")
    if data == "full":
        return pd.read_csv(AWS_BUCKET_TEST_PATH)
    return pd.read_csv(AWS_BUCKET_TEST_PATH, nrows=nrows)
def download_model(model_directory="PipelineTest", bucket=BUCKET_NAME, rm=True):
    """Download a serialized pipeline from Google Cloud Storage and load it.

    The blob ``models/<MODEL_NAME>/versions/<model_directory>/model.joblib``
    is fetched from ``bucket`` into ``model.joblib`` in the current working
    directory and then deserialized with joblib.

    Args:
        model_directory: version folder of the model inside the bucket.
        bucket: name of the storage bucket to read from.
        rm: when true, delete the downloaded file once loading has finished,
            whether it succeeded or raised.

    Returns:
        The object stored in the downloaded joblib file.
    """
    # NOTE(review): this is the same relative path as PATH_TO_LOCAL_MODEL, so
    # with rm=True a local model saved there is overwritten and then deleted.
    local_path = 'model.joblib'

    gcs_bucket = storage.Client().bucket(bucket)
    storage_location = f"models/{MODEL_NAME}/versions/{model_directory}/model.joblib"
    gcs_bucket.blob(storage_location).download_to_filename(local_path)
    print("=> pipeline downloaded from storage")

    try:
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code -- only point this at buckets you trust.
        return joblib.load(local_path)
    finally:
        # Clean up even when loading fails, so a broken download is not left
        # behind in the working directory.
        if rm:
            os.remove(local_path)
def get_model(path_to_joblib):
    """Load and return the object serialized in the joblib file at ``path_to_joblib``."""
    return joblib.load(path_to_joblib)
def evaluate_model(y, y_pred):
    """Score predictions against the true values.

    Args:
        y: true target values.
        y_pred: predicted target values.

    Returns:
        A dict with keys ``'MAE'`` (mean absolute error) and ``'RMSE'``
        (square root of the mean squared error), each rounded to 2 decimals.
    """
    abs_err = mean_absolute_error(y, y_pred)
    sq_err = mean_squared_error(y, y_pred)
    return {'MAE': round(abs_err, 2), 'RMSE': round(sqrt(sq_err), 2)}
def generate_submission_csv(nrows, kaggle_upload=False):
    """Predict fares for the test set and save them as a Kaggle-format CSV.

    Loads the test data with ``get_test_data(nrows)``, loads the model stored
    at ``PATH_TO_LOCAL_MODEL``, writes the ``key`` and ``fare_amount`` columns
    to ``predictions_test_ex.csv`` and optionally submits that file through
    the kaggle command-line tool.

    Args:
        nrows: number of test rows requested from ``get_test_data``.
        kaggle_upload: when true, submit the CSV with the kaggle CLI. Leave
            it False unless the kaggle CLI is installed.
    """
    df_test = get_test_data(nrows)
    pipeline = get_model(PATH_TO_LOCAL_MODEL)

    # If the loaded object exposes ``best_estimator_`` (presumably a fitted
    # scikit-learn search object), predict with that estimator; otherwise
    # predict with the loaded object itself.
    estimator = getattr(pipeline, "best_estimator_", pipeline)
    df_test["fare_amount"] = estimator.predict(df_test)

    name = "predictions_test_ex.csv"
    df_test[["key", "fare_amount"]].to_csv(name, index=False)
    print("prediction saved under kaggle format")

    if not kaggle_upload:
        return

    # Local import: only the optional upload path needs it.
    import subprocess

    # The submission message is the file name without its extension.
    kaggle_message_submission = os.path.splitext(name)[0]
    command = [
        "kaggle", "competitions", "submit",
        "-c", "new-york-city-taxi-fare-prediction",
        "-f", name,
        "-m", kaggle_message_submission,
    ]
    try:
        # Argument list instead of a shell string: nothing is re-parsed by a
        # shell. The exit status is ignored, as it was before.
        subprocess.run(command, check=False)
    except FileNotFoundError:
        # Report a missing CLI instead of raising, keeping the upload best-effort.
        print("kaggle cli not found: submission was not uploaded")
if __name__ == "__main__":
    # ⚠️ in order to push a submission to kaggle you need to use the WHOLE dataset
    # Script entry point: predict on a 100-row sample and skip the upload.
    nrows = 100
    generate_submission_csv(nrows=nrows, kaggle_upload=False)