-
-
Notifications
You must be signed in to change notification settings - Fork 10
/
clf-train.py
84 lines (64 loc) · 2.83 KB
/
clf-train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
"""Example for training a random forest classifier in sklearn
and using mlflow to save a model.
"""
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def main():
    """Train a random forest pipeline on the breast cancer data and save it.

    Command-line arguments:
        modelPath: Path passed to ``mlflow.sklearn.save_model``.
        --outputTestData: Optional csv file name. When given, the held-out
            test features (without the target column) are written to it.

    A failure while saving the model is reported on stdout rather than
    raised.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('modelPath', type=str,
                        help='Name of mlflow artifact path location to drop model.')
    parser.add_argument('--outputTestData', type=str,
                        help='Name of output csv file if writing split test data.')
    args = parser.parse_args()
    model_path = args.modelPath
    test_data_path = args.outputTestData

    # Load a standard machine learning dataset
    cancer = load_breast_cancer()
    df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
    df['target'] = cancer['target']

    # Define features and target variables
    features = [col for col in df.columns if col != 'target']
    x_raw = df[features]
    y_raw = df['target']

    # Split data into training and testing; the test labels are not used here
    x_train, x_test, y_train, _ = train_test_split(x_raw, y_raw,
                                                   test_size=0.20,
                                                   random_state=123,  # seeding
                                                   stratify=y_raw)

    # Optionally write test data, used for inference example with the API.
    # Write to the path the user supplied instead of a fixed file name.
    if test_data_path:
        x_test.to_csv(test_data_path, index=False)
        print(f"Test data written to '{test_data_path}'")

    # Build a classifier sklearn pipeline
    clf = RandomForestClassifier(n_estimators=100,
                                 min_samples_leaf=2,
                                 class_weight='balanced',
                                 random_state=123)
    preprocessor = Pipeline(steps=[('scaler', StandardScaler())])
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('randomforestclassifier', clf)])

    # Train the model
    model.fit(x_train, y_train)

    def overwrite_predict(func):
        """Wrap ``func`` so it returns column 1 of its result, rounded to 4 places."""
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            # Keep only the second column of the 2-D result, as a plain list.
            return [round(x, 4) for x in result[:, 1]]
        return wrapper

    # Overwriting the model to use predict to output probabilities
    model.predict = overwrite_predict(model.predict_proba)

    # Save the model locally
    try:
        mlflow.sklearn.save_model(model, model_path)
        print(f"Model saved at path: {model_path}")
    except Exception as e:
        print(f"Error saving model at path {model_path}: {e}. Does it already exist?")
# Run the training workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()