-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
90 lines (63 loc) · 1.86 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import nltk
import re
import pandas as pd
# Runs at import time: asks NLTK to make the 'stopwords' corpus available
# (this may access the network).
nltk.download('stopwords')
# Module-level list of English stop words; remove_stop_words() filters against it.
sw=nltk.corpus.stopwords.words('english')
def remove_stop_words(txt):
    """Return ``txt`` with the stop words listed in ``sw`` removed.

    The text is split on single spaces, every token that appears verbatim
    in the module-level ``sw`` list is dropped, and the remaining tokens
    are joined back together with single spaces.

    Matching is exact (case-sensitive), and because the split is on a
    single space, runs of consecutive spaces yield empty tokens that are
    kept as-is.

    Args:
        txt: The text to filter.

    Returns:
        The filtered text as a single string.
    """
    # Build the lookup set once per call so each membership test is O(1)
    # instead of a linear scan over the stop-word list.
    stop_words = set(sw)
    return " ".join(word for word in txt.split(" ") if word not in stop_words)
def remove_numeric(txt):
    """Return ``txt`` without its purely numeric tokens.

    The text is split on single spaces and every token for which
    ``str.isnumeric()`` is true is dropped. Tokens that merely contain
    digits (``"x2"``) or contain non-digit characters such as ``"."`` or
    ``"-"`` (``"12.5"``, ``"-3"``) are kept. The remaining tokens are
    joined with single spaces.

    Args:
        txt: The text to filter.

    Returns:
        The filtered text as a single string.
    """
    return " ".join(word for word in txt.split(" ") if not word.isnumeric())
def list_to_str(lst):
    """Join the items of ``lst`` into one space-separated string.

    Each item is converted with ``str()`` first, so non-string items are
    accepted. An empty iterable gives an empty string.
    """
    return ' '.join(map(str, lst))
def get_nouns(txt):
    """Return the tokens of ``txt`` that NLTK tags as ``NN`` or ``NNP``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; tokens whose tag is ``"NN"`` or ``"NNP"`` are kept in
    their original order and joined with single spaces. Other tags
    (including the plural noun tags) are not in the accepted list and are
    therefore dropped.

    NOTE(review): this module only downloads the 'stopwords' corpus; the
    tokenizer and POS-tagger data used here presumably have to be
    installed separately -- confirm against the deployment environment.

    Args:
        txt: The text to analyse.

    Returns:
        The selected tokens as a single space-separated string.
    """
    noun_tags = ("NNP", "NN")
    # Call the tokenizer through the already-imported ``nltk`` package; the
    # bare name ``word_tokenize`` is not imported anywhere in this module.
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))
    return " ".join(token for token, tag in tagged if tag in noun_tags)
def remove_meaninglesswords(txt, meanless_words):
    """Drop from ``txt`` every token that is contained in ``meanless_words``.

    ``txt`` is split on single spaces; tokens for which
    ``token in meanless_words`` holds are discarded and the rest are
    re-assembled with ``list_to_str``.
    """
    kept_tokens = [
        token for token in txt.split(" ") if token not in meanless_words
    ]
    return list_to_str(kept_tokens)
def remove_RT(txt):
    """Drop every space-separated token of ``txt`` that contains ``'@'``.

    Only tokens containing an ``'@'`` character are removed; all other
    tokens are kept in order and re-assembled with ``list_to_str``.
    """
    kept_tokens = []
    for token in txt.split(" "):
        if '@' in token:
            continue  # skip any token containing '@'
        kept_tokens.append(token)
    return list_to_str(kept_tokens)
def merge_df_col_val(txt_col):
    """Concatenate all values of ``txt_col`` into one lower-cased string.

    Every element of ``txt_col.values.flatten()`` is converted with
    ``str()``, lower-cased, and appended to the result in iteration order.

    Args:
        txt_col: Any object exposing ``.values.flatten()`` (presumably a
            pandas Series or DataFrame -- confirm against callers).

    Returns:
        The concatenated, lower-cased text; an empty string when there are
        no values.
    """
    # NOTE(review): values are joined with an EMPTY separator, so the last
    # word of one entry fuses with the first word of the next. This mirrors
    # the existing behaviour; if a space was intended, confirm with callers
    # before changing the separator.
    return "".join(str(value).lower() for value in txt_col.values.flatten())
def remove_punct_marks(txt):
    """Return the word-character runs of ``txt`` joined by single spaces.

    Every maximal run matching the regex ``\\w+`` is kept; everything
    between those runs (punctuation, whitespace, symbols) is replaced by a
    single space. Because ``\\w`` also matches the underscore, underscores
    are preserved.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text; an empty string when ``txt`` has no word
        characters.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return " ".join(re.findall(r"\w+", txt))
def get_tweet_for_specific_range(str_strt_date, str_end_date, df):
    """Return the rows of ``df`` whose ``"date"`` lies in an inclusive range.

    Both bounds are parsed with ``pd.to_datetime`` and reduced to calendar
    dates via ``.date()`` before being compared against ``df["date"]``.

    Args:
        str_strt_date: Lower bound, in any form ``pd.to_datetime`` accepts.
        str_end_date: Upper bound, in any form ``pd.to_datetime`` accepts.
        df: DataFrame with a ``"date"`` column (presumably holding
            ``datetime.date`` objects, since the bounds are plain dates --
            confirm against callers).

    Returns:
        The subset of ``df`` with ``first_day <= date <= last_day``.
    """
    first_day = pd.to_datetime(str_strt_date).date()
    last_day = pd.to_datetime(str_end_date).date()
    dates = df["date"]
    in_range = (dates >= first_day) & (dates <= last_day)
    return df[in_range]