# dataset.py
import datetime
import os
from typing import Callable, Optional

import pandas as pd
from sklearn import preprocessing
import numpy as np
import torch
from torch_geometric.data import (
    Data,
    InMemoryDataset,
)

pd.set_option('display.max_columns', None)


class AMLtoGraph(InMemoryDataset):
    """Builds a single account-level graph from the 'HI-Small_Trans.csv' transaction
    table: each account becomes a node (bank id plus per-currency average paid/received
    features), each transaction a directed edge, and the node label is the account's
    'Is Laundering' flag."""

    def __init__(self, root: str, edge_window_size: int = 10,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        self.edge_window_size = edge_window_size
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> str:
        return 'HI-Small_Trans.csv'

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    @property
    def num_nodes(self) -> int:
        return self._data.edge_index.max().item() + 1

    def df_label_encoder(self, df, columns):
        # Integer-encode each categorical column in place with a fresh LabelEncoder.
        le = preprocessing.LabelEncoder()
        for i in columns:
            df[i] = le.fit_transform(df[i].astype(str))
        return df

    def preprocess(self, df):
        # Encode the categorical columns and min-max normalise the timestamp.
        df = self.df_label_encoder(df, ['Payment Format', 'Payment Currency', 'Receiving Currency'])
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.value)
        df['Timestamp'] = (df['Timestamp'] - df['Timestamp'].min()) / (df['Timestamp'].max() - df['Timestamp'].min())
        # Make account identifiers globally unique by prefixing them with their bank.
        df['Account'] = df['From Bank'].astype(str) + '_' + df['Account']
        df['Account.1'] = df['To Bank'].astype(str) + '_' + df['Account.1']
        df = df.sort_values(by=['Account'])
        # Split out the paying and receiving sides for the per-currency aggregates.
        receiving_df = df[['Account.1', 'Amount Received', 'Receiving Currency']]
        paying_df = df[['Account', 'Amount Paid', 'Payment Currency']]
        receiving_df = receiving_df.rename({'Account.1': 'Account'}, axis=1)
        currency_ls = sorted(df['Receiving Currency'].unique())
        return df, receiving_df, paying_df, currency_ls

    def get_all_account(self, df):
        # Collect every account that appears as sender or receiver, with its bank,
        # and mark an account as laundering if it occurs in any laundering transaction.
        ldf = df[['Account', 'From Bank']]
        rdf = df[['Account.1', 'To Bank']]
        suspicious = df[df['Is Laundering'] == 1]
        s1 = suspicious[['Account', 'Is Laundering']]
        s2 = suspicious[['Account.1', 'Is Laundering']]
        s2 = s2.rename({'Account.1': 'Account'}, axis=1)
        suspicious = pd.concat([s1, s2], join='outer')
        suspicious = suspicious.drop_duplicates()
        ldf = ldf.rename({'From Bank': 'Bank'}, axis=1)
        rdf = rdf.rename({'Account.1': 'Account', 'To Bank': 'Bank'}, axis=1)
        df = pd.concat([ldf, rdf], join='outer')
        df = df.drop_duplicates()
        df['Is Laundering'] = 0
        df.set_index('Account', inplace=True)
        df.update(suspicious.set_index('Account'))
        df = df.reset_index()
        return df

    def paid_currency_aggregate(self, currency_ls, paying_df, accounts):
        # One feature column per currency: the account's average paid amount in that
        # currency. The per-account means are mapped back by account name so the values
        # line up with the accounts frame rather than with the transaction index.
        for i in currency_ls:
            temp = paying_df[paying_df['Payment Currency'] == i]
            mean_paid = temp.groupby('Account')['Amount Paid'].mean()
            accounts['avg paid ' + str(i)] = accounts['Account'].map(mean_paid)
        return accounts

    def received_currency_aggregate(self, currency_ls, receiving_df, accounts):
        # Same aggregation for the receiving side; accounts with no activity in a
        # currency end up as NaN and are filled with 0.
        for i in currency_ls:
            temp = receiving_df[receiving_df['Receiving Currency'] == i]
            mean_received = temp.groupby('Account')['Amount Received'].mean()
            accounts['avg received ' + str(i)] = accounts['Account'].map(mean_received)
        accounts = accounts.fillna(0)
        return accounts

    def get_edge_df(self, accounts, df):
        # Map account names to consecutive node ids and build the edge list:
        # one directed edge per transaction, with the remaining transaction columns
        # (timestamp, amounts, currencies, payment format) as edge features.
        accounts = accounts.reset_index(drop=True)
        accounts['ID'] = accounts.index
        mapping_dict = dict(zip(accounts['Account'], accounts['ID']))
        df['From'] = df['Account'].map(mapping_dict)
        df['To'] = df['Account.1'].map(mapping_dict)
        df = df.drop(['Account', 'Account.1', 'From Bank', 'To Bank'], axis=1)
        edge_index = torch.stack([torch.from_numpy(df['From'].values),
                                  torch.from_numpy(df['To'].values)], dim=0)
        df = df.drop(['Is Laundering', 'From', 'To'], axis=1)
        edge_attr = torch.from_numpy(df.values).to(torch.float)
        return edge_attr, edge_index

    def get_node_attr(self, currency_ls, paying_df, receiving_df, accounts):
        # Node features: encoded bank id plus per-currency average paid/received
        # amounts; the node label is the account-level 'Is Laundering' flag.
        node_df = self.paid_currency_aggregate(currency_ls, paying_df, accounts)
        node_df = self.received_currency_aggregate(currency_ls, receiving_df, node_df)
        node_label = torch.from_numpy(node_df['Is Laundering'].values).to(torch.float)
        node_df = node_df.drop(['Account', 'Is Laundering'], axis=1)
        node_df = self.df_label_encoder(node_df, ['Bank'])
        node_df = torch.from_numpy(node_df.values).to(torch.float)
        return node_df, node_label

    def process(self):
        # Read the raw CSV, build node and edge tensors, and save a single
        # collated Data object to the processed directory.
        df = pd.read_csv(self.raw_paths[0])
        df, receiving_df, paying_df, currency_ls = self.preprocess(df)
        accounts = self.get_all_account(df)
        node_attr, node_label = self.get_node_attr(currency_ls, paying_df, receiving_df, accounts)
        edge_attr, edge_index = self.get_edge_df(accounts, df)
        data = Data(x=node_attr,
                    edge_index=edge_index,
                    y=node_label,
                    edge_attr=edge_attr)

        data_list = [data]
        if self.pre_filter is not None:
            data_list = [d for d in data_list if self.pre_filter(d)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(d) for d in data_list]
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
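

# A minimal usage sketch: it assumes the raw CSV has been placed at
# <root>/raw/HI-Small_Trans.csv so InMemoryDataset can find it, and the
# 'data/AML' root path below is only an illustrative placeholder.
if __name__ == '__main__':
    dataset = AMLtoGraph('data/AML')  # runs process() on first use, then loads data.pt
    graph = dataset[0]                # the single collated Data object
    print(graph)
    print('nodes:', graph.num_nodes, 'edges:', graph.num_edges)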