forked from melifluos/twitter_age_detection
tf_basic_mnist.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import numpy as np
from utils import *
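# utils (project-local) is expected to provide read_pickle and remove_sparse_features, used below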
from sklearn.preprocessing import OneHotEncoder
# Import data
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
from sklearn.cross_validation import KFold, StratifiedKFold
__author__ = 'benchamberlain'
FLAGS = None

class MLData(object):
    def __init__(self):
        self.features = None
        self.target = None

    def next_batch(self, batch_size):
        """
        sample a random batch of data (with replacement)
        """
        n_data, _ = self.features.shape
        idx = np.random.choice(n_data, batch_size)
        target_batch = self.target.eval()[idx, :]
        feature_batch = np.array(self.features[idx, :].todense())
        return feature_batch, target_batch

class MLdataset(object):
    """
    supervised ml data object holding a train and a test split
    """
    def __init__(self):
        self.train = MLData()
        self.test = MLData()

def read_data(threshold):
    """
    Read the Twitter user ages test data
    :param threshold: the minimum number of edges a feature (column) must have to be retained
    :return: a scipy sparse feature matrix and a numpy array of integer class labels
    """
    x_path = 'resources/X.p'
    y_path = 'resources/y.p'
    X = read_pickle(x_path)
    features, cols = remove_sparse_features(X, threshold=threshold)
    # features = np.array(X.todense())
    targets = read_pickle(y_path)
    target = np.array(targets['cat'])
    # print the normalised class distribution as a sanity check
    unique, counts = np.unique(target, return_counts=True)
    total = float(sum(counts))
    norm_counts = counts / total
    print(np.asarray((unique, norm_counts)).T)
    return features, target

def run_cv_pred(X, y, n_folds=3):
    """
    Run n-fold cross validation returning a prediction for every row of X
    :param X: A scipy sparse feature matrix
    :param y: The target labels corresponding to rows of X
    :param n_folds: The number of cross validation folds
    :return: An array of out-of-fold predictions, one per row of X
    """
    # Construct a stratified kfolds object so each fold keeps the class proportions
    kf = StratifiedKFold(y, n_folds=n_folds)
    y_pred = y.copy()
    # Iterate through folds
    for train_index, test_index in kf:
        data = MLdataset()
        data.train.features, data.test.features = X[train_index], X[test_index]
        data.train.target, data.test.target = y[train_index], y[test_index]
        # Initialize a classifier with key word arguments
        preds = run_classifier(data, batch_size=100)
        y_pred[test_index] = preds
    return y_pred

def run_classifier(data, batch_size=100, n_iter=2000):
    """
    Run the tensorflow multinomial logistic regression classifier
    :param data: an MLdataset object with train and test splits
    :param batch_size: the number of examples used per gradient step
    :param n_iter: the number of gradient steps
    :return: an array of predicted class indices for the test split
    """
    n_data, n_features = data.train.features.shape
    # need to one-hot encode the integer labels
    data.train.target = tf.one_hot(data.train.target, depth=10, dtype=tf.float32)
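    # data.train.target is now an (n_data, 10) float tensor: row i has a 1 in the column
    # given by the integer label and 0 elsewhere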
    x = tf.placeholder(tf.float32, [None, n_features])
    W = tf.Variable(tf.zeros([n_features, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
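    # i.e. the batch mean of -sum_k y_[k] * log(softmax(y)[k]); the softmax is folded into
    # the op rather than applied to y explicitly, which is more numerically stable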
    # This returns an op that performs a single step of gradient descent through backprop
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    # using an interactive session allows us to interleave building and running elements of the graph
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()
    # Train
    for _ in range(n_iter):
        # take a random sample of the data for mini-batch gradient descent
        idx = np.random.choice(n_data, batch_size)
        y_slice = data.train.target.eval()[idx, :]
        x_slice = np.array(data.train.features[idx, :].todense())
        sess.run(train_step, feed_dict={x: x_slice, y_: y_slice})
    test_features = np.array(data.test.features.todense())
    preds = sess.run(tf.argmax(y, 1), feed_dict={x: test_features})
    return preds
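
# run_classifier returns an int array of length n_test (one predicted class index per row
# of data.test.features), which run_cv_pred writes back into the out-of-fold slots of y_pred.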

def accuracy(y, pred):
    """
    The fraction of predictions that match the true labels
    """
    return sum(y == pred) / float(len(y))

def main():
    # mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    features, target = read_data(threshold=5)
    n_data, n_features = features.shape
    # Create the model
    # Placeholders allow the DAG to be constructed in python. The second parameter is the shape.
    # The placeholders are float32 even though the data is integer because they are multiplied
    # by float32 weights and tensorflow does not cast automatically.
    x = tf.placeholder(tf.float32, [None, n_features])
    # in tensorflow params are usually read into variables
    W = tf.Variable(tf.zeros([n_features, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
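    # y holds the unnormalised class scores (logits); the softmax that converts them to
    # probabilities is applied inside the loss op below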
    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # This returns an op that performs a single step of gradient descent through backprop
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()
    # one-hot encode the integer labels so they can be sliced and fed like the MNIST targets
    target = tf.one_hot(target, depth=10, dtype=tf.float32)
    # Train
    for _ in range(2):
        # batch_xs, batch_ys = mnist.train.next_batch(100)
        idx = np.random.choice(3000, 100)
        y_slice = target.eval()[idx, :]
        x_slice = np.array(features[idx, :].todense())
        sess.run(train_step, feed_dict={x: x_slice, y_: y_slice})
    # Test trained model
    y_test = target[3000:, :].eval()
    x_test = np.array(features[3000:, :].todense())
    print(tf.argmax(y_test, 1).eval())
    print(sess.run(tf.argmax(y, 1), feed_dict={x: x_test}))
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
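    # casting the boolean correctness vector to float32 and taking its mean gives the
    # fraction of test rows predicted correctly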
    print(sess.run(accuracy, feed_dict={x: x_test,
                                        y_: y_test}))

if __name__ == '__main__':
    features, target = read_data(threshold=5)
    y_pred = run_cv_pred(features, target, n_folds=2)
    print(y_pred[0:20])
    print(accuracy(target, y_pred))