forked from bnsreenu/python_for_microscopists
-
Notifications
You must be signed in to change notification settings - Fork 0
/
049-Logistic_regression.py
105 lines (71 loc) · 3.74 KB
/
049-Logistic_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"
# https://www.youtube.com/watch?v=xl8ljyE66jM
"""
@author: Sreenivas Bhattiprolu
"""
import pandas as pd
from matplotlib import pyplot as plt
#STEP 1: DATA READING AND UNDERSTANDING
df = pd.read_csv("other_files/images_analyzed_productivity1.csv")
print(df.head())
#plt.scatter(df.Age, df.Productivity, marker='+', color='red')
#plt.scatter(df.Time, df.Productivity, marker='+', color='red')
#plt.scatter(df.Coffee, df.Productivity, marker='+', color='red')
#PLot productivity values to see the split between Good and Bad
sizes = df['Productivity'].value_counts(sort = 1)
plt.pie(sizes, shadow=True, autopct='%1.1f%%')
#Good to know so we know the proportion of each label
#STEP 2: DROP IRRELEVANT DATA
#In our example, Images_Analyzed reflects whether it is good analysis or bad
#so should not include it. ALso, User number is just a number and has no inflence
#on the productivity, so we can drop it.
df.drop(['Images_Analyzed'], axis=1, inplace=True)
df.drop(['User'], axis=1, inplace=True)
#STEP 3: Handle missing values, if needed
#df = df.dropna() #Drops all rows with at least one null value.
#STEP 4: Convert non-numeric to numeric, if needed.
#Sometimes we may have non-numeric data, for example batch name, user name, city name, etc.
#e.g. if data is in the form of YES and NO then convert to 1 and 2
df.Productivity[df.Productivity == 'Good'] = 1
df.Productivity[df.Productivity == 'Bad'] = 2
print(df.head())
#STEP 5: PREPARE THE DATA.
#Y is the data with dependent variable, this is the Productivity column
Y = df["Productivity"].values #At this point Y is an object not of type int
#Convert Y to int
Y=Y.astype('int')
#X is data with independent variables, everything except Productivity column
# Drop label column from X as you don't want that included as one of the features
X = df.drop(labels = ["Productivity"], axis=1)
#print(X.head())
#STEP 6: SPLIT THE DATA into TRAIN AND TEST data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=20)
#random_state can be any integer and it is used as a seed to randomly split dataset.
#By doing this we work with same test dataset evry time, if this is important.
#random_state=None splits dataset randomly every time
#print(X_train)
#STEP 7: Defining the model and training.
from sklearn.linear_model import LogisticRegression #Import the relevant model
model = LogisticRegression() #Create an instance of the model.
model.fit(X_train, y_train) # Train the model using training data
#STEP 8: TESTING THE MODEL BY PREDICTING ON TEST DATA
#AND CALCULATE THE ACCURACY SCORE
prediction_test = model.predict(X_test)
from sklearn import metrics
#Print the prediction accuracy
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
#Test accuracy for various test sizes and see how it gets better with more training data
#UNDERSTAND WHICH VARIABLES HAVE MOST INFLUENCE ON THE OUTCOME
# To get the weights of all the variables
print(model.coef_) #Print the coefficients for each independent variable.
#But it is not clear which one corresponds to what.
#SO let us print both column values and coefficients.
#.Series is a 1-D labeled array capable of holding any data type.
#Default index would be 0,1,2,3... but let us overwrite them with column names for X (independent variables)
weights = pd.Series(model.coef_[0], index=X.columns.values)
print("Weights for each variables is a follows...")
print(weights)
#+VE VALUE INDICATES THAT THE VARIABLE HAS A POSITIVE IMPACT