mit_experiment.py
import os, time, datetime
import pickle  # needed for pickle.dump below (may also be re-exported by experiments_base)
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import networkx as nx
from experiments_base import *  # estimation helpers used below (graph_stats_fixed_group, estimate_multiple_times, localtime)
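
# Experiment on the MIT call data: build a monthly sequence of call graphs from
# ./data/Calls.csv with group labels from ./data/Subjects.csv, drop users that
# are isolated in the kept snapshots, run the community estimation routines
# from experiments_base, and pickle the results.
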
def get_contiguous_ids(all_userids,df_label):
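    """Map raw user ids to contiguous node ids starting at 1, keeping only
    users with a known year_school group; returns the forward mapping
    (user_id -> {'id', 'group'}) and the reverse mapping (id -> user_id)."""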
    userids_to_newids = {}
    newids_to_userids = {}
    tempidx = 1
    for user_id in all_userids:
        # hacky because index and user_id differ by 1 for df_label
        if df_label.year_school[user_id-1] > 0:
            userids_to_newids[user_id] = {'id': tempidx, 'group': df_label.year_school[user_id-1]}
            newids_to_userids[tempidx] = user_id
            tempidx += 1
    return userids_to_newids, newids_to_userids

def construct_graph_sequence(df,userids_to_newids, newids_to_userids):
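    """Build one networkx graph per calendar month starting October 2008:
    nodes are the relabelled users (with their group as a node attribute) and
    an edge joins caller and callee for each call in that month. Stops at the
    first empty month and returns the list of graphs."""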
    st = time.time()
    print('Generating data')
    start_date = datetime.datetime(2008, 10, 1, 0, 0, 0)  # hard coded
    GT = []
    while True:
        df_current = df[df.date >= start_date]
        df_current = df_current[df_current.date < start_date + relativedelta(months=1)]
        print('start_date', start_date, ' len(df_current) ', len(df_current))
        if len(df_current) == 0:
            break
        else:
            Gcurrent = nx.Graph()
            for user_id in userids_to_newids:
                Gcurrent.add_node(userids_to_newids[user_id]['id'], group=np.array([userids_to_newids[user_id]['group']]))
            for user_id in userids_to_newids:
                df_temp = df_current.loc[df_current['user_id'] == user_id, ]
                # print(len(df_temp))
                for x in df_temp.dest_user_id_if_known:
                    if x in userids_to_newids:
                        Gcurrent.add_edge(userids_to_newids[user_id]['id'], userids_to_newids[x]['id'])
            GT.append(Gcurrent)
        print('new interval is', start_date, start_date + relativedelta(months=1))
        start_date = start_date + relativedelta(months=1)
    return GT

def processed_data():
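    """Load Calls.csv and Subjects.csv, recode year_school to integers
    (1 = undergraduate, 2 = GRT / Other, 0 = unknown), drop calls with a
    missing caller or callee id, and add a 'date' column (timestamps floored
    to whole days)."""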
    df = pd.read_csv('./data/Calls.csv')
    df_label = pd.read_csv('./data/Subjects.csv')
    # len(df)  # 63824
    df_label['year_school'] = df_label['year_school'].astype(str)
    df_label['year_school'] = df_label['year_school'].str.replace('Freshman', '1')
    df_label['year_school'] = df_label['year_school'].str.replace('Sophomore', '1')
    df_label['year_school'] = df_label['year_school'].str.replace('Senior', '1')
    df_label['year_school'] = df_label['year_school'].str.replace('Junior', '1')
    df_label['year_school'] = df_label['year_school'].str.replace('GRT / Other', '2')
    df_label['year_school'] = df_label['year_school'].str.replace('nan', '0')
    df_label['year_school'] = df_label['year_school'].astype(int)
    # drop NA values for nodes
    df = df.dropna(subset=['user_id'])
    df = df.dropna(subset=['dest_user_id_if_known'])
    # Change timestamps to dates
    df['date'] = pd.to_datetime(df['time_stamp']).dt.floor('d')
    df.index = range(len(df))
    return df, df_label

df,df_label = processed_data()
all_userids = set(df['user_id'].unique()).union(set(df['dest_user_id_if_known'].unique()))
userids_to_newids, newids_to_userids = get_contiguous_ids(all_userids, df_label)  # for relabeling
GT0 = construct_graph_sequence(df,userids_to_newids, newids_to_userids)
# Remove the last three months because they have lots of isolates
isolated = []
for G in GT0:
    isolated.append([x for x in nx.isolates(G)])
# print([len(x) for x in isolated])
isolated = isolated[:6]  # keep only the first six monthly snapshots
# print([len(x) for x in isolated])
# Remove nodes that are isolated in at least one of the kept snapshots
beta_nodes = set()
for x in isolated:
    beta_nodes = set().union(beta_nodes, x)
print('beta_nodes', beta_nodes)
# print('len(beta_nodes)', len(beta_nodes))
beta_userids = [newids_to_userids[x] for x in beta_nodes]
all_userids_filtered = [x for x in all_userids if x not in beta_userids]
userids_to_newids_filtered, newids_to_userids_filtered = get_contiguous_ids(all_userids_filtered, df_label)  # for relabeling
GT = construct_graph_sequence(df, userids_to_newids_filtered, newids_to_userids_filtered)
GT = GT[:6]
# There will still be some isolated nodes, but the fewer the better.
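
# Estimation settings: Bernoulli dynamic with k = 2 communities, estimated at
# snapshots 2..len(GT)-1 over a 21-point grid.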
params = {}
params['dynamic'] = 'bernoulli'
params['n'] = len(GT[0])
params['k'] = 2 # number of communities
params['total_time'] = len(GT)
params['estimation_indices'] = range(2,len(GT))
params['ngridpoints'] = 21 # grid search parameter
params['start_time'] = time.time()
params['unify_method'] = 'sets'  # alternative: 'lp'
params['only_unify'] = False
params['debug'] = False
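
# Fit the model on the graph sequence and record the elapsed time.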
glog = graph_stats_fixed_group(params,GT)
log = estimate_multiple_times(params,GT,glog)
print("\t Run finish time:", time.time() - params['start_time'])
params['end_time_delta'] = time.time() - params['start_time']
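
# Save logs, parameters and the graph sequence; localtime() is assumed to be a
# timestamp helper exported by experiments_base.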
fname = './output/pickles/real_mit_log_'+params['dynamic']+'_'+localtime()+'.pkl'
pickle.dump({'log':[log],'glog':[glog],'params':params, 'GT':GT},open(fname,'wb'))
print('Experiment elapsed time:', params['end_time_delta'])