# -*- coding: utf-8 -*-
"""
Created on Wed Aug 7 19:25:35 2019
@author: nitin
"""
import os
import re
from operator import itemgetter

import docx
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from docx.shared import Inches
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Download the NLTK data used below (tokenizer, stop words, WordNet lemmatizer)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
######################Summary################
doc = docx.Document()
doc.add_heading('Meeting Notes - Global Executive Dashboard', 0)
doc.add_heading('Summary', 1)
df = pd.read_csv("transcript2.csv")
#df = open('transcript.txt').read().strip()
# Exploratory peeks (no visible effect when run as a script)
df.head()
df['article_text'][0]
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
    sentences.append(sent_tokenize(s))
sentences = [y for x in sentences for y in x]  # flatten list
sentences[:5]
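# Illustrative example of what sent_tokenize produces (hypothetical input, not
# from the transcript):
#   sent_tokenize("Good morning. Let's begin.")  ->  ['Good morning.', "Let's begin."]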
#https://nlp.stanford.edu/projects/glove/
# Extract word vectors
os.getcwd()
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
len(word_embeddings)
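# Quick sanity check (illustrative): each entry is a 100-dimensional float32
# vector, e.g. word_embeddings['meeting'].shape -> (100,)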
#We now have word vectors for 400,000 different
#terms stored in the dictionary – ‘word_embeddings’.
#Text Preprocessing
# remove punctuation, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
# make all letters lowercase
clean_sentences = [s.lower() for s in clean_sentences]
n = len(sentences)
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
#Vector Representation of Sentences
# The GloVe vectors were already loaded into `word_embeddings` above,
# so there is no need to read glove.6B.100d.txt a second time here.
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        # average of the word vectors (the small epsilon avoids division by zero)
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()]) / (len(i.split()) + 0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)
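# Worked example (hypothetical sentence "revenue grew"): its vector would be
# (word_embeddings['revenue'] + word_embeddings['grew']) / (2 + 0.001),
# i.e. approximately the mean of the two 100-dim word vectors.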
#Similarity Matrix Preparation
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))[0, 0]
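# A vectorized alternative (sketch; same result as the loop above, and much
# faster for long transcripts):
#   M = np.vstack(sentence_vectors)   # shape (n_sentences, 100)
#   sim_mat = cosine_similarity(M)    # full pairwise matrix in one call
#   np.fill_diagonal(sim_mat, 0)      # zero out self-similarity, as above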
#Applying PageRank Algorithm
import networkx as nx
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
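# `scores` maps sentence index -> PageRank score; a higher score means the
# sentence is more central to the transcript (the TextRank idea).
# nx.pagerank uses a damping factor of alpha=0.85 by default.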
#Summary Extraction
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
# Keep the top third of ranked sentences as summary candidates
ranked_sentences_proc = []
for i in range(n // 3):
    print(ranked_sentences[i][1])
    ranked_sentences_proc.append(ranked_sentences[i][1])
ranked_sentences_proc_df = pd.DataFrame(ranked_sentences_proc)
ranked_sentences_proc_df['Length'] = ranked_sentences_proc_df[0].str.len()
# Rank 1 = longest sentence, so sorting by Rank puts the longest candidates first
ranked_sentences_proc_df['Rank'] = ranked_sentences_proc_df['Length'].rank(ascending=False, method='average')
ranked_sentences_proc_df = ranked_sentences_proc_df.sort_values(by='Rank', ascending=True)
final = ranked_sentences_proc_df[0:7]
final = final.sort_index()  # restore original transcript order
k = []
for i in final.iloc[1:6, 0]:
    k.append(i)
final_txt = ' '.join(k)
p = doc.add_paragraph(final_txt)
r = p.add_run()
##############To-do; Critical Items#####################
rawText = open('transcript.txt').read().strip()
##to-do and critical
rawText = rawText.lower()
rawText = rawText.replace('?', '.')
rawText = rawText.split('.')
# Return the sentences from `sentence` that contain every word in `words`
def check(sentence, words):
    res = [all([k in s for k in words]) for s in sentence]
    return [sentence[i] for i in range(0, len(res)) if res[i]]
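# Illustrative usage of check() (hypothetical sentences, not from the transcript):
#   check(['please send the email today', 'we should align'], ['email'])
#   -> ['please send the email today']
# Note: `k in s` is a substring test, so 'day' would also match 'today'.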
# Driver code
#model = "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments."
#sentence_old = "Hello my name is alpha. I am going to help you. my agenda for today."
#checker = rawText.replace(".","','").strip()
#print(checker)
sentence = rawText
#sentence = ['Can you do this for me','make sure you do this','Can you also help','email','My important strategy is this ....','meeting','today','wrong','know','details','week' ,'day', 'help in this', 'my agenda', 'the conclusion..', 'brainstorm ...', 'collaborate ...','new update','what are the updates', 'think','feedback','everyone','input','result','method','point', 'important','agree','disagree','could','dont','do','recommend','should','how','approach','next','meeting','deadline','Working on this project', 'deadline is next weekend...', 'We must do ......', 'We could.....', 'Please check.....', 'Assure ....', 'Look into this....',]
word_can = ['can']  # lowercase to match the lowercased transcript
word_could = ['could']
word_should = ['should']
word_help = ['help']
word_agenda = ['agenda']
word_conclusion = ['conclusion']
word_today = ['today']
word_email =['email']
word_meeting =['meeting']
word_wrong = ['wrong']
word_right = ['right']
word_know = ['know']
word_do = ['do']
word_dont = ['dont']
word_details = ['details']
word_week = ['week']
word_day = ['day']
word_brainstorm = ['brainstorm']
word_collaborate = ['collaborate']
word_update = ['update']
word_think = ['think']
word_feedback = ['feedback']
word_result = ['result']
word_method = ['method']
word_important = ['important']
word_point = ['point']
word_agree = ['agree']
word_disagree = ['disagree']
word_recommend = ['recommend']
word_dashboard = ['dashboard']
word_primarily = ['primarily']
word_performance =['performance']
word_phrase = ['important', 'strategy']
word_phrase_2 = ['make', 'sure']
word_phrase_3 = ['immediate', 'action']
word_immediate = ['immediate']
word_use = ['use']
word_year =["years"]
word_days =["days"]
word_want =["want"]
#note_can =print(check(sentence, word_can))
#note_help =print(check(sentence, word_help))
#note_agenda =print(check(sentence, word_agenda))
#note_conclusion =print(check(sentence, word_conclusion))
#note_dashboard =print(check(sentence, word_dashboard))
note_primarily = check(sentence, word_primarily)
note_year = check(sentence, word_year)
note_days = check(sentence, word_days)
note_want = check(sentence, word_want)
note_immediate = check(sentence, word_immediate)
#note_use = print(check(sentence, word_use))
#print(note_primarily)
#for i in note_primarily:
#    print(i.strip())
note_phrase_2 = check(sentence, word_phrase_2)
note_phrase_3 = check(sentence, word_phrase_3)
#critical = print(note_immediate + note_phrase_2)
to_do = note_immediate + note_phrase_2
critical = (note_year, note_days, note_want)
#doc = docx.Document()
#doc.add_heading('Meeting Notes - Global Executive Dashboard', 0)
doc.add_heading('To-Do List', 1)
for i in to_do:
    i = i.replace('\n', '').strip().capitalize()
    p = doc.add_paragraph(i, style='ListBullet')
    r = p.add_run()
doc.add_heading('Critical Items', 1)
# `critical` is a tuple of lists, so iterate one level deeper than `to_do`
for i in critical:
    for a in i:
        a = a.replace('\n', '').strip().capitalize()
        p = doc.add_paragraph(a, style='ListBullet')
        r = p.add_run()
WNL = nltk.WordNetLemmatizer()
# -----
##################WordCloud#########################################
def prepareStopWords():
    # Load default stop words and add a few more specific to this transcript.
    stopwordsList = stopwords.words('english')
    stopwordsList.append('dont')
    stopwordsList.append('didnt')
    stopwordsList.append('doesnt')
    stopwordsList.append('cant')
    stopwordsList.append('couldnt')
    stopwordsList.append('couldve')
    stopwordsList.append('im')
    stopwordsList.append('ive')
    stopwordsList.append('isnt')
    stopwordsList.append('theres')
    stopwordsList.append('wasnt')
    stopwordsList.append('wouldnt')
    stopwordsList.extend(['good', 'morning', 'a', 'immediate', 'require', 'like'])
    stopwordsList.extend(['also', 'action item', 'highlight area'])
    stopwordsList.extend(['give u', 'would', 'immediate attention', 'review meeting', 'good morning'])
    stopwordsList.extend(['Male Speaker 1', 'would like', 'information', 'sam', 'make sure', 'Kate'])
    return stopwordsList
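# Note: the filtering below compares single tokens against this list, so
# multi-word entries such as 'action item' or 'good morning' will never match
# an individual token; they are kept here as in the original list.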
# -----
# Open the file and read lines
# NOTE: You need to give finder.score_ngrams a sizable corpus to work with.
rawText=open('transcript.txt').read()
# Lowercase and tokenize
# Remove single quote early since it causes problems with the tokenizer.
# wasn't turns into 2 entries; was, n't.
rawText = rawText.replace("'", "").replace(".","")
tokens = nltk.word_tokenize(rawText)
text = nltk.Text(tokens)
# Load default stop words and add a few more.
stopWords = prepareStopWords()
# Remove extra chars and remove stop words.
text_content = [''.join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word)) for word in text]
text_content = [word for word in text_content if word not in stopWords]
# After the punctuation above is removed it still leaves empty entries in the list.
# Remove any entries where the len is zero.
text_content = [s for s in text_content if len(s) != 0]
# Best to get the lemmas of each word to reduce the number of similar words
# on the word cloud. The default lemmatize method is noun, but this could be
# expanded.
# ex: The lemma of 'characters' is 'character'.
text_content = [WNL.lemmatize(t) for t in text_content]
# set up and score the bigrams using the raw frequency.
finder = BigramCollocationFinder.from_words(text_content)
bigram_measures = BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.raw_freq)
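# `scored` is a list of ((word1, word2), raw_frequency) pairs; for example
# (illustrative numbers), a bigram occurring 5 times among ~1000 tokens would
# score 5/1000 = 0.005.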
# setup and score the trigram using the raw frequency.
#finder = nltk.collocations.TrigramCollocationFinder.from_words(text_content)
#trigram_measures = nltk.collocations.TrigramAssocMeasures()
#scored = finder.score_ngrams(trigram_measures.raw_freq)
#
# By default finder.score_ngrams is sorted, however don't rely on this default behavior.
# Sort highest to lowest based on the score.
scoredList = sorted(scored, key=itemgetter(1), reverse=True)
# word_dict is the dictionary we'll use for the word cloud.
# Load dictionary with the FOR loop below.
# The dictionary will look like this with the bigram and the score from above.
# word_dict = {'bigram A': 0.000697411,
# 'bigram B': 0.000524882}
word_dict = {}
listLen = len(scoredList)
# Get the bigram and make a contiguous string for the dictionary key.
# Set the key to the scored value.
for i in range(listLen):
    word_dict[' '.join(scoredList[i][0])] = scoredList[i][1]
# -----
# Set word cloud params and instantiate the word cloud.
# The height and width only affect the output image file.
WC_height = 500
WC_width = 1000
WC_max_words = 20
wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width,
                      background_color='white')
wordCloud.generate_from_frequencies(word_dict)
plt.title('Most frequently occurring bigrams')
plt.imshow(wordCloud, interpolation='bilinear')
plt.axis("off")
plt.show()
wordCloud.to_file("WordCloud_Bigrams_frequent_words.png")
doc.add_heading('Most Frequent Words', 1)
doc.add_picture("WordCloud_Bigrams_frequent_words.png",width=Inches(3.0), height=Inches(3.0))
doc.save('demo.docx')
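# Files assumed to be in the working directory for an end-to-end run (inferred
# from the code above): transcript2.csv with an 'article_text' column,
# transcript.txt, and glove.6B.100d.txt from https://nlp.stanford.edu/projects/glove/.
# Outputs: demo.docx and WordCloud_Bigrams_frequent_words.png.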