-
Notifications
You must be signed in to change notification settings - Fork 0
/
testScriptMutation.py
117 lines (107 loc) · 4.88 KB
/
testScriptMutation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#Inputs:
#1. path to clean dataset
#2. list of attribute names (comma separated)
#3. Index of mutation operator
#example: python testScriptMutation.py synthetic_1_clean.csv "value" 1
#Outputs:
#1. Mutated data file in CSV
#2. A CSV file that containts outlier indexes
#assumption: The first column of dataset is an increasing index
import sys
import pandas as pd
import random
clean_dataset=pd.read_csv(sys.argv[1])
M_dataset=clean_dataset
attribute_names=sys.argv[2].split(",")
num_of_muted_attrs=1
mutated_attributes=[0]
if len(attribute_names)>1:
num_of_muted_attrs=random.randint(1,len(attribute_names))
mutated_attributes=random.sample(range(1, len(attribute_names)), num_of_muted_attrs)
print("**********")
print(mutated_attributes)
print(num_of_muted_attrs)
operator=sys.argv[3]
print(operator)
#random.randint(1,5)
muted_attributes=[]
ids=[]
if operator=="1":
#select records to mutate
r0=int(0.002*len(M_dataset))
mutated_records=[]
for x in range(r0):
mutated_records.append(random.randint(0,len(M_dataset)))
for x in mutated_records:
#ids.append(M_dataset.ix[x, M_dataset.columns.values[0]]) commented by sanket on 28jun2020 because it is deprecated and was not working. replaced by below iloc line across the file
ids.append(M_dataset[M_dataset.columns.values[0]].iloc[x])
else:
#select the subsequent to mutate
len_temp=random.randint(int(0.2*len(clean_dataset)),len(M_dataset))
temp_df=M_dataset.tail(len_temp)
starting_index=len(M_dataset)-len_temp
r=random.randint(int(0.05*len(clean_dataset)) , int(0.2*len(clean_dataset)))
for x in range(r):
index=starting_index+x
#ids.append(M_dataset.ix[index, M_dataset.columns.values[0]])
ids.append(M_dataset[M_dataset.columns.values[0]].iloc[index])
####################################################################
for i in mutated_attributes:
attribute=attribute_names[i]
muted_attributes.append(attribute)
print(attribute)
if operator=="1":
#M1
min_value=-10*abs(int(clean_dataset[attribute].min(axis = 0)))
max_value=10*int(clean_dataset[attribute].max(axis = 0))
for x in mutated_records:
index=x
#M_dataset.ix[index, attribute] = random.randint(min_value,max_value)
M_dataset[attribute].iloc[index] = random.randint(min_value,max_value)
if operator=="2":
#M2
temp_df=M_dataset[attribute].tail(len_temp)
shifted_horiz_df=temp_df.shift(periods=r, axis = 0)
#shifted_horiz_df=shifted_horiz_df.fillna(M_dataset.ix[index, attribute])
shifted_horiz_df=shifted_horiz_df.fillna(M_dataset[attribute].iloc[index])
for i in range(len_temp):
index_to_replace=i+(len(M_dataset)-len(temp_df))
#M_dataset.ix[index_to_replace,attribute]=shifted_horiz_df[index_to_replace]
M_dataset[attribute].iloc[index_to_replace]=shifted_horiz_df[index_to_replace]
if operator=="3":
#M3
temp_df=M_dataset[attribute].tail(len_temp)
min_value=-5*abs(int(clean_dataset[attribute].min(axis = 0)))
max_value=5*int(clean_dataset[attribute].max(axis = 0))
value_to_add=random.randint(min_value,max_value)
shifted_vertical_df=temp_df+value_to_add
for i in range(r):
index_to_replace=i+(len(M_dataset)-len(temp_df))
#M_dataset.ix[index_to_replace,attribute]=shifted_vertical_df[index_to_replace]
M_dataset[attribute].iloc[index_to_replace]=shifted_vertical_df[index_to_replace]
if operator=="4":
#M4
min_value=(int(clean_dataset[attribute].min(axis = 0)))
max_value=int(clean_dataset[attribute].max(axis = 0))
temp_df=M_dataset[attribute].tail(len_temp)
value_to_multiply=random.randint(min_value,max_value)
print(temp_df)
rescaled_df=temp_df*value_to_multiply
print("***************")
print(rescaled_df)
for i in range(r):
index_to_replace=i+(len(M_dataset)-len(temp_df))
#M_dataset.ix[index_to_replace,attribute]=rescaled_df[index_to_replace]
M_dataset[attribute].iloc[index_to_replace]=rescaled_df[index_to_replace]
if operator=="5":
#M5
min_value=int(clean_dataset[attribute].min(axis = 0))
max_value=int(clean_dataset[attribute].max(axis = 0))
temp_df=M_dataset[attribute].tail(len_temp)
for i in range(r):
index_to_replace=i+(len(M_dataset)-len(temp_df))
#M_dataset.ix[index_to_replace,attribute]=random.randint(min_value,max_value)
M_dataset[attribute].iloc[index_to_replace]=random.randint(min_value,max_value)
M_dataset.to_csv(sys.argv[1][:-4]+"_"+''.join(map(str, muted_attributes)) +"_M"+str(operator)+".csv", index=False)
outliers_dataset = pd.DataFrame(ids, columns = [M_dataset.columns.values[0]])
outliers_dataset.to_csv(sys.argv[1][:-4]+"_"+''.join(map(str, muted_attributes))+"_M"+str(operator)+"_outliers.csv", index=False)