generated from suhaibmujahid/se4ai-2022-03-21
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example.py
71 lines (54 loc) · 2.32 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Cap on the number of rows kept for the experiment.
max_training_rows = 10000

# Column names used to address the metrics table.
target = 'buggy'
date_column = 'author_date'
id_column = 'commit_id'

# Length, in days, of the trailing window that is ignored.
ignored_days = 90
seconds_in_day = 24 * 60 * 60

# Feature columns, grouped by metric family.
# NOTE(review): the meaning of the individual abbreviations is not visible
# here — confirm against the schema of the metrics CSV.
experience = ['exp', 'rexp', 'sexp']
history = ['ndev', 'nuc', 'age']
size = ['la', 'ld', 'lt']
diffusion = ['ns', 'nd', 'nf', 'entropy']

# Flat list of every feature column, in family order.
features = [*experience, *history, *size, *diffusion]
def main():
    """Train and evaluate a random-forest classifier on the metrics CSV.

    Steps:
      1. Load the metrics file.
      2. Keep only rows older than ``ignored_days`` days before the latest
         date in the data.
      3. Drop rows with missing features/target, then keep at most
         ``max_training_rows`` rows with the largest dates.
      4. Split 90/10 into train/test, tune hyperparameters with a
         randomized search (5-fold CV, ROC-AUC), retrain with the best
         parameters and print a classification report for the test split.

    Prints progress and results to stdout; returns nothing.
    """
    df = pd.read_csv("data/mybatis-3_5ffe1bc68e3f65b96a5eb9e2_metrics.csv")

    # NOTE(review): the arithmetic below assumes `date_column` holds numeric
    # timestamps expressed in seconds — confirm against the CSV.
    latest_date = df[date_column].max()
    oldest_ignored_date = latest_date - (ignored_days * seconds_in_day)
    df = df.loc[df[date_column] < oldest_ignored_date]

    # Only complete rows can be used for training/evaluation.
    df = df.dropna(subset=features + [target], axis=0)

    # Newest rows first, then cap the amount of data.
    df = df.sort_values(date_column, ascending=False)
    df = df[:max_training_rows]

    X = df.loc[:, features]
    # Labels are cast to strings so the classifier treats them as classes.
    y = df.loc[:, target].astype(str)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    print("Training on {} rows, testing on {} rows".format(
        len(X_train), len(X_test))
    )

    param_ranges = {
        'n_estimators': [int(x) for x in np.linspace(start=10, stop=2000, num=10)],
        'bootstrap': [True, False],
        'max_depth': [int(x) for x in np.linspace(5, 100, num=10)],
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
        # for classifiers it was an alias of 'sqrt', so this keeps the
        # same search space while working on current releases.
        'max_features': ['sqrt', None],
    }

    search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                param_distributions=param_ranges,
                                scoring='roc_auc',
                                cv=5,
                                verbose=1,
                                n_jobs=10)
    best_params = search.fit(X_train, y_train).best_params_
    print("best hyperparameters:", best_params)

    # best_params holds exactly the keys of param_ranges, all of which are
    # RandomForestClassifier constructor arguments.
    model = RandomForestClassifier(**best_params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)
# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()