preprocessing.py
import csv
import re

import pandas as pd
from nltk.tokenize import sent_tokenize

# Raise the csv module's field size limit: pandas' python engine (used below)
# parses through the csv module and would otherwise fail on very long articles.
csv.field_size_limit(1000000)

def pre_process_pb(path):
    '''Clean the FinancialPhraseBank data.

    Requires:
        - Sentences_66Agree.txt
    Creates:
        - Sentences_AllAgree_preprocessed_baseline.csv
        - Sentences_AllAgree_preprocessed.csv
    '''
    df = pd.read_csv(path + 'FinancialPhraseBank-v1.0/Sentences_66Agree.txt',
                     sep='@',
                     engine='python',
                     header=None,
                     names=['sentence', 'label'])
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    df.to_csv(path + 'Sentences_AllAgree_preprocessed_baseline.csv', index=False)
    # One-hot encode the sentiment label into one indicator column per class.
    df = pd.get_dummies(df, columns=['label'])
    df.to_csv(path + 'Sentences_AllAgree_preprocessed.csv', index=False)
    return df
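
# Usage sketch (assumes `path` ends with a trailing slash and contains the
# unzipped FinancialPhraseBank-v1.0 folder):
#     df = pre_process_pb('data/')
#     # -> columns: sentence, plus one label_* indicator column per sentiment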

def clean_5(x):
    '''Insert a missing space after a full stop that is directly followed by a capitalized word.'''
    words = re.findall(r'\.[A-Z][a-z]*', x)
    for i in words:
        x = x.replace(i, '. ' + i[1:])
    return x
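
# Example: clean_5 repairs glued sentence boundaries so that sent_tokenize
# below can split them correctly, e.g.
#     clean_5('Profit rose.Shares gained.')  ->  'Profit rose. Shares gained.'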

def pre_process_news(path):
    '''Clean the news articles: keep only articles from 2018 that are tagged AAPL.

    Requires:
        - all_upto_2019-08-29.csv (this file is confidential and will only be provided if you need it)
    Creates:
        - pre_processed_aapl_sentences.csv
    '''
    df = pd.read_csv(path + 'all_upto_2019-08-29.csv',
                     sep=',',
                     engine='python',
                     parse_dates=['article_time'],
                     encoding='utf-8')
    df['symbols'] = df['symbols'].fillna(value='None')
    df = df[df['symbols'].str.contains('AAPL')]
    df = df[df['article_time'].dt.year == 2018]
    df['text'] = df['article_title'] + '. ' + df['article_content']
    # Series.replace only matches whole cell values; use str.replace so these
    # substrings are stripped from within each article.
    df['text'] = df['text'].str.replace('\xa0', ' ', regex=False)
    df['text'] = df['text'].str.replace('\n', ' ', regex=False)
    df['text'] = df['text'].str.replace('Inc.', '', regex=False)
    df['text'] = df['text'].str.replace('"', '', regex=False)
    df['text'] = df['text'].apply(clean_5)  # assign: apply does not modify in place
    df = df[['article_time', 'text']]
    # Split each article into sentences and stack to one sentence per row,
    # keeping the original article index so timestamps can be re-joined.
    sentences = df['text'].apply(sent_tokenize)
    sentences = sentences.apply(pd.Series).stack()
    sentences.index = sentences.index.droplevel(-1)
    sentences.name = 'text'
    sentences = pd.DataFrame(sentences)
    df.drop('text', axis=1, inplace=True)
    df = df.join(sentences)
    df.to_csv(path + 'pre_processed_aapl_sentences.csv')
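
# Usage sketch (assumes the confidential news dump sits under `path`):
#     pre_process_news('data/')
#     # writes pre_processed_aapl_sentences.csv: one sentence per row, each
#     # row carrying the article_time of the article it came from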

def pre_process_vars(path):
    '''Pre-process the other economic indicators and variables used as inputs for the prediction.

    Requires:
        - AAPL_daily.csv
        - fama5.csv
        - LIBOR USD.csv
        - USDX.csv
    '''
    nasdaq = pd.read_csv(path + 'AAPL_daily.csv', usecols=['Date', 'Close'], parse_dates=['Date'])
    nasdaq.columns = ['date', 'nasdaq_index_close']
    fama = pd.read_csv(path + 'fama5.csv', parse_dates=[0], skiprows=2)
    fama['date'] = fama['Unnamed: 0']
    fama = fama[['date', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']]
    libor = pd.read_csv(path + 'LIBOR USD.csv', parse_dates=[0])
    libor = libor[['Date', '1M', '3M']]
    libor.columns = ['date', 'libor_1M', 'libor_3M']
    # Term spread between the 3-month and 1-month USD LIBOR rates.
    libor['risk_premium'] = libor['libor_3M'] - libor['libor_1M']
    usd_index = pd.read_csv(path + 'USDX.csv', parse_dates=[0], usecols=['Date', 'Close'])
    usd_index.columns = ['date', 'usd_index_close']
    # Index every frame by date so they can be joined on the trading day.
    for frame in (nasdaq, fama, libor, usd_index):
        frame.set_index('date', inplace=True)
    return nasdaq, fama, libor, usd_index
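
# Usage sketch (assumes all four CSVs sit under `path`; the frames share a
# date index, so one feature table can be built by joining on it):
#     nasdaq, fama, libor, usd_index = pre_process_vars('data/')
#     features = nasdaq.join([fama, libor, usd_index], how='inner')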