data_augment.py
"""
Augment the dataset according to the loss functions.
Input:
- a regression data set (x, a, y), which may be obtained using the data_parser
- loss function
- Theta, a set of thresholds in between 0 and 1
Output:
a weighted classification dataset (X, A, Y, W)
"""
import numpy as np
import pandas as pd
import data_parser as parser
from itertools import repeat

_LOGISTIC_C = 5  # scale constant of the re-scaled logistic loss
def augment_data_ab(X, A, Y, Theta):
    """
    Take the input data and augment it with an additional feature of
    theta; return X tensor_product Theta.
    For the absolute loss, we do not do any reweighting.
    TODO: might add the alpha/2 to match with the write-up
    """
    n = np.shape(X)[0]  # number of original data points
    num_theta = len(Theta)
    # Replicate the dataset once per threshold value
    X_aug = pd.concat(repeat(X, num_theta))
    A_aug = pd.concat(repeat(A, num_theta))
    Y_values = pd.concat(repeat(Y, num_theta))
    # Adding theta to the features: the i-th copy of the data gets Theta[i]
    theta_list = [s for theta in Theta for s in repeat(theta, n)]
    X_aug['theta'] = pd.Series(theta_list, index=X_aug.index)
    # Binary label: does the true value reach the threshold theta?
    Y_aug = (Y_values >= X_aug['theta']).map({True: 1, False: 0})
    # Re-index the augmented data and attach unit weights
    X_aug.index = range(n * num_theta)
    Y_aug.index = range(n * num_theta)
    A_aug.index = range(n * num_theta)
    W_aug = pd.Series(1, Y_aug.index)
    return X_aug, A_aug, Y_aug, W_aug
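
# A minimal usage sketch for augment_data_ab (hypothetical toy data; real
# inputs would come from data_parser):
#
#     X = pd.DataFrame({'x1': [0.2, 0.7]})
#     A = pd.Series([0, 1])
#     Y = pd.Series([0.3, 0.9])
#     X_aug, A_aug, Y_aug, W_aug = augment_data_ab(X, A, Y, [0.25, 0.5, 0.75])
#
# The result has n * num_theta = 6 rows; Y_aug is 1 exactly where Y >= theta,
# e.g. (y = 0.9, theta = 0.5) -> 1, and W_aug is all ones since the absolute
# loss needs no reweighting.
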
def augment_data_sq(x, a, y, Theta):
    """
    Augment the dataset so that x carries an additional feature of theta,
    then attach the appropriate weight to each data point.
    Theta: assume a uniform grid Theta
    """
    n = np.shape(x)[0]  # number of original data points
    num_theta = len(Theta)
    width = Theta[1] - Theta[0]  # cell width of the uniform grid
    # Replicate the dataset once per threshold value
    X_aug = pd.concat(repeat(x, num_theta))
    A_aug = pd.concat(repeat(a, num_theta))
    Y_values = pd.concat(repeat(y, num_theta))
    # Adding theta to the features
    theta_list = [s for theta in Theta for s in repeat(theta, n)]
    X_aug['theta'] = pd.Series(theta_list, index=X_aug.index)
    # Re-index the augmented data
    X_aug.index = range(n * num_theta)
    A_aug.index = range(n * num_theta)
    Y_values.index = range(n * num_theta)
    # Compute the weights: change in squared loss across one grid cell
    # centered at theta
    sq_loss = lambda a, b: (a - b) ** 2  # squared loss function
    weight_assign = lambda theta, y: (sq_loss(theta + width / 2, y)
                                      - sq_loss(theta - width / 2, y))
    W = weight_assign(X_aug['theta'], Y_values)
    # Label is 1 when raising theta lowers the loss (negative weight);
    # keep the magnitude as the weight
    Y_aug = 1 * (W < 0)
    W = abs(W)
    return X_aug, A_aug, Y_aug, W
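
# For the squared loss, the weight above has a closed form:
#     (theta + width/2 - y)**2 - (theta - width/2 - y)**2
#         = 2 * width * (theta - y),
# so its sign flips exactly at theta = y: Y_aug = 1 (negative weight)
# precisely for thresholds below the true value, matching the thresholding
# rule of augment_data_ab up to the boundary case theta = y.
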
def augment_data_logistic(x, a, y, Theta):
    """
    Augment the dataset so that x carries an additional feature of theta,
    then attach the appropriate weight to each data point, so that we
    optimize for the logistic loss.
    Theta: assume a uniform grid Theta
    y: assume the labels are {0, 1}
    """
    n = np.shape(x)[0]  # number of original data points
    num_theta = len(Theta)
    width = Theta[1] - Theta[0]  # cell width of the uniform grid
    # Replicate the dataset once per threshold value
    X_aug = pd.concat(repeat(x, num_theta))
    A_aug = pd.concat(repeat(a, num_theta))
    Y_values = pd.concat(repeat(y, num_theta))
    # Adding theta to the features
    theta_list = [s for theta in Theta for s in repeat(theta, n)]
    X_aug['theta'] = pd.Series(theta_list, index=X_aug.index)
    # Re-index the augmented data
    X_aug.index = range(n * num_theta)
    A_aug.index = range(n * num_theta)
    Y_values.index = range(n * num_theta)
    # Re-scaled logistic loss: inputs are mapped from {0, 1} to {-1, 1} and
    # the loss is normalized by its value at the farthest mistake
    logistic_loss = lambda y_hat, y: (
        np.log(1 + np.exp(-_LOGISTIC_C * (2 * y - 1) * (2 * y_hat - 1)))
        / np.log(1 + np.exp(_LOGISTIC_C)))
    # Compute the weights: change in logistic loss across one grid cell
    # centered at theta
    weight_assign = lambda theta, y: (logistic_loss(theta + width / 2, y)
                                      - logistic_loss(theta - width / 2, y))
    W = weight_assign(X_aug['theta'], Y_values)
    # Label is 1 when raising theta lowers the loss (negative weight);
    # keep the magnitude as the weight
    Y_aug = 1 * (W < 0)
    W = abs(W)
    return X_aug, A_aug, Y_aug, W
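
# A minimal end-to-end sketch (hypothetical toy data, not part of the original
# pipeline): build a uniform threshold grid and run the weighted augmentations.
if __name__ == "__main__":
    x = pd.DataFrame({'x1': [0.1, 0.4, 0.8]})
    a = pd.Series([0, 1, 0], name='attr')
    y = pd.Series([0.2, 0.5, 0.9], name='label')
    Theta = np.linspace(0, 1, 41)  # uniform grid with width = 0.025

    X_sq, A_sq, Y_sq, W_sq = augment_data_sq(x, a, y, Theta)
    print(X_sq.shape)   # (3 * 41, 2): original feature plus 'theta'
    print(W_sq.sum())   # total weight of the classification dataset

    # The logistic variant expects labels already in {0, 1}
    y01 = pd.Series([0, 1, 1], name='label')
    X_lg, A_lg, Y_lg, W_lg = augment_data_logistic(x, a, y01, Theta)
    print(Y_lg.value_counts())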