Salary_ML.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


def clean_data(df):
    '''
    INPUT
    df - pandas dataframe

    OUTPUT
    X - a matrix holding all of the variables to consider when predicting the response
    y - the corresponding response vector

    This function cleans df using the following steps to produce X and y:
    1. Drop all the rows with missing salaries
    2. Create y as the Salary column
    3. Drop the Salary, Respondent, and ExpectedSalary columns
    4. For each numeric variable, fill missing values with the column mean
    5. Create dummy columns for all the categorical variables, dropping the originals
    '''
    # Drop rows with missing salary values
    df = df.dropna(subset=['Salary'], axis=0)
    y = df['Salary']

    # Drop the respondent id, expected salary, and response columns
    df = df.drop(['Respondent', 'ExpectedSalary', 'Salary'], axis=1)

    # Fill missing values in numeric columns with the column mean
    # (chained .fillna(..., inplace=True) is deprecated in recent pandas)
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        df[col] = df[col].fillna(df[col].mean())

    # Dummy the categorical variables: for each, add dummy columns and drop
    # the original (drop_first avoids a redundant baseline column)
    cat_vars = df.select_dtypes(include=['object']).columns
    for var in cat_vars:
        df = pd.concat([df.drop(var, axis=1),
                        pd.get_dummies(df[var], prefix=var, prefix_sep='_',
                                       drop_first=True)],
                       axis=1)

    X = df
    return X, y
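

# A minimal usage sketch for clean_data, assuming a toy frame with the same
# column names as the survey data (all values below are hypothetical):
#
#   demo = pd.DataFrame({'Respondent': [1, 2, 3],
#                        'Salary': [50000.0, 60000.0, np.nan],
#                        'ExpectedSalary': [np.nan] * 3,
#                        'YearsProgram': [3.0, np.nan, 10.0],
#                        'Country': ['US', 'CA', 'US']})
#   X_demo, y_demo = clean_data(demo)
#   # y_demo holds the two non-missing salaries; X_demo holds the mean-filled
#   # YearsProgram column plus a single Country_US dummy (drop_first=True).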


def find_optimal_lm_mod(X, y, cutoffs, test_size=.30, random_state=42, plot=True):
    '''
    INPUT
    X - pandas dataframe, X matrix
    y - pandas series, response variable
    cutoffs - list of ints, cutoffs for the number of non-zero values in dummy categorical vars
    test_size - float between 0 and 1, default 0.3, the proportion of data held out as test data
    random_state - int, default 42, controls the random state for train_test_split
    plot - boolean, default True, whether to plot the result

    OUTPUT
    r2_scores_test - list of floats, r2 scores on the test data
    r2_scores_train - list of floats, r2 scores on the train data
    lm_model - model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train_test_split used for the optimal model
    '''
    r2_scores_test, r2_scores_train, num_feats, results = [], [], [], dict()

    for cutoff in cutoffs:
        # Reduce the X matrix to columns with more than `cutoff` non-zero values
        reduce_X = X.iloc[:, np.where(X.sum() > cutoff)[0]]
        num_feats.append(reduce_X.shape[1])

        # Split the data into train and test
        X_train, X_test, y_train, y_test = train_test_split(reduce_X, y,
                                                            test_size=test_size,
                                                            random_state=random_state)

        # Fit the model and obtain predicted responses
        # (the `normalize` argument was removed in scikit-learn 1.2; scaling
        # does not change ordinary least squares predictions, so it is omitted)
        lm_model = LinearRegression()
        lm_model.fit(X_train, y_train)
        y_test_preds = lm_model.predict(X_test)
        y_train_preds = lm_model.predict(X_train)

        # Record the r2 values from the test and train sets
        r2_scores_test.append(r2_score(y_test, y_test_preds))
        r2_scores_train.append(r2_score(y_train, y_train_preds))
        results[str(cutoff)] = r2_score(y_test, y_test_preds)

    if plot:
        plt.plot(num_feats, r2_scores_test, label="Test", alpha=.5)
        plt.plot(num_feats, r2_scores_train, label="Train", alpha=.5)
        plt.xlabel('Number of Features')
        plt.ylabel('Rsquared')
        plt.title('Rsquared by Number of Features')
        plt.legend(loc=1)
        plt.show()

    # Refit on the cutoff that produced the best test r2
    best_cutoff = max(results, key=results.get)
    reduce_X = X.iloc[:, np.where(X.sum() > int(best_cutoff))[0]]
    X_train, X_test, y_train, y_test = train_test_split(reduce_X, y,
                                                        test_size=test_size,
                                                        random_state=random_state)
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)

    return r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test
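

# A minimal sketch (with hypothetical values) of the cutoff reduction above:
# for 0/1 dummy columns, X.sum() counts how often each feature occurs, so the
# np.where mask keeps only columns seen more than `cutoff` times, e.g.:
#
#   toy = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 0, 0]})
#   toy.iloc[:, np.where(toy.sum() > 1)[0]]  # keeps 'a' (sum 3), drops 'b' (sum 1)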


def main():
    df = pd.read_csv('../Part1/stackoverflow/survey_results_public.csv')
    X, y = clean_data(df)

    # Each cutoff is the minimum number of non-zero values a dummy column
    # needs in order to be kept, so lower cutoffs keep more predictors.
    cutoffs = [5000, 3500, 2500, 1000, 100, 50, 30, 20, 10, 5]
    r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test = \
        find_optimal_lm_mod(X, y, cutoffs, plot=False)
    print('Finished Finding the Best Model')
    return lm_model


if __name__ == '__main__':
    best_model = main()
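

# Assumed invocation, given the relative CSV path in main() (the Stack
# Overflow survey file from Part1 of this repo):
#   python Salary_ML.py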