code.py
#Importing the libraries
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
import streamlit as st
from textblob import TextBlob
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten, BatchNormalization
from keras.models import Sequential, load_model, model_from_config
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import cohen_kappa_score
import random
from PIL import Image
import language_check
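# Note: this script assumes older library versions (standalone Keras, gensim 3.x with the
# size=/init_sims API used below, and the language_check package for LanguageTool grammar checking).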
#Reading the dataset
data = pd.read_csv("training_set_rel3.tsv", sep='\t', encoding = "ISO-8859-1")
col_to_keep =['essay','domain1_score']
data = data[col_to_keep]
#Adding the image widget
image = Image.open('nlp.png')
st.image(image, use_column_width=True)
#Adding the title and subsequent text
st.title("Welcome to the Automated Essay Grading System")
st.header("""
This system lets you input an essay written for the given prompt and returns a score for it.
""")
st.warning("The following is the essay prompt. It calls for a persuasive essay.")
st.success("""
More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends.
Write an essay to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.
""")
#Basic input
name = st.text_input("Before we proceed, enter your name in the box below: ")
organ = ['Academic', 'Non Academic', 'Corporate', 'Other']
organisation = st.multiselect("Enter the purpose of using this software:", organ)
st.write("All righty then, click on this button to view the dataset")
click = st.button("CLICK")
if click:
    st.dataframe(data)
filter_data = data
#Preprocessing the dataset
def preprocessing():
    #Calculates word count
    def word_counting(x):
        return len(TextBlob(x).words)
    filter_data['word_length'] = filter_data['essay'].apply(word_counting)

    #Calculates sentence count
    def sentence_counting(x):
        return len(TextBlob(x).sentences)
    filter_data['no_of_sentence'] = filter_data['essay'].apply(sentence_counting)

    #Calculates sentiment polarity of the essay
    def avg_sentence_sentiment(x):
        return TextBlob(x).sentiment.polarity
    filter_data['sentiment_essay'] = filter_data['essay'].apply(avg_sentence_sentiment)

    #Calculates average length of words
    def avg_length_of_words(x):
        word_len = [len(word) for word in TextBlob(x).words]
        return sum(word_len) / len(word_len)
    filter_data['avg_word_len'] = filter_data['essay'].apply(avg_length_of_words)

    #Counts grammatical errors flagged by LanguageTool
    def grammar_check(x):
        tool = language_check.LanguageTool('en-US')
        matches = tool.check(x)
        return len(matches)
    filter_data['Grammar_check'] = filter_data['essay'].apply(grammar_check)
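# Note: grammar_check builds a new LanguageTool instance for every essay, so computing these
# features over the full dataset takes a while; the feature view below is therefore optional.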
#The next few lines can be commented out as they take some time to run. If time is not an issue, feel free to go ahead
st.write("If you wish to, check this box to view the features of the essay set")
click_1 = st.checkbox("Check this box")
if click_1:
    preprocessing()
    st.dataframe(filter_data)
#Enter the essay to be graded here
essay_to_be_graded = st.text_area("Enter here the essay to be graded")
data = data.append({'essay': essay_to_be_graded, 'domain1_score': random.randint(2, 12)}, ignore_index=True)
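# The user's essay is appended with a random placeholder score so it passes through the same
# train/test split and vectorisation pipeline as the training essays.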
#Processing of the essay to be graded
def processing():
    y = data['domain1_score']
    #Splitting into train and test set
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=42)
    train_e = X_train['essay'].tolist()
    test_e = X_test['essay'].tolist()
    train_sents = []
    test_sents = []
    stop_words = set(stopwords.words('english'))

    #Converts a sentence into a list of lowercase words with stopwords removed
    def sent2word(x):
        x = re.sub("[^A-Za-z]", " ", x)
        x = x.lower()
        filtered_sentence = []
        words = x.split()
        for w in words:
            if w not in stop_words:
                filtered_sentence.append(w)
        return filtered_sentence

    #Splits an essay into sentences and converts each sentence into a word list
    def essay2word(essay):
        essay = essay.strip()
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        raw = tokenizer.tokenize(essay)
        final_words = []
        for i in raw:
            if len(i) > 0:
                final_words.append(sent2word(i))
        return final_words

    for i in train_e:
        train_sents += essay2word(i)
    for i in test_e:
        test_sents += essay2word(i)
    #Layout of the LSTM Model
    def get_model():
        model = Sequential()
        model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))
        model.add(BatchNormalization())
        model.add(LSTM(64, recurrent_dropout=0.4))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='relu', kernel_initializer='he_normal'))
        model.compile(loss='mean_squared_error', optimizer='RMSProp', metrics=['mae'])
        model.summary()
        return model
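    # The network regresses the essay score from a single 300-dimensional timestep:
    # two stacked LSTMs with dropout, a single ReLU output unit, MSE loss and MAE metric.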
    # Training the Word2Vec model (gensim 3.x style keyword arguments)
    num_features = 300
    min_word_count = 20
    num_workers = 4  # number of worker threads; must be a positive integer
    context = 10
    downsampling = 1e-3
    model = Word2Vec(train_sents,
                     workers=num_workers,
                     size=num_features,
                     min_count=min_word_count,
                     window=context,
                     sample=downsampling)
    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)
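    # The trained word vectors are saved to 'word2vecmodel.bin'; the in-memory model is used directly below.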
    # Averages the Word2Vec vectors of all in-vocabulary words of an essay into a single feature vector
    def makeVec(words, model, num_features):
        vec = np.zeros((num_features,), dtype="float32")
        noOfWords = 0.
        index2word_set = set(model.wv.index2word)
        for i in words:
            if i in index2word_set:
                noOfWords += 1
                vec = np.add(vec, model.wv[i])
        if noOfWords > 0:
            vec = np.divide(vec, noOfWords)
        return vec

    # Builds the feature matrix by vectorising every essay
    def getVecs(essays, model, num_features):
        c = 0
        essay_vecs = np.zeros((len(essays), num_features), dtype="float32")
        for i in essays:
            essay_vecs[c] = makeVec(i, model, num_features)
            c += 1
        return essay_vecs
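    # Each essay is reduced to one averaged vector, so the LSTM receives a single timestep per essay
    # (hence the reshape to (n_essays, 1, num_features) further down).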
    clean_train = []
    for i in train_e:
        clean_train.append(sent2word(i))
    training_vectors = getVecs(clean_train, model, num_features)

    clean_test = []
    for i in test_e:
        clean_test.append(sent2word(i))
    testing_vectors = getVecs(clean_test, model, num_features)

    training_vectors = np.array(training_vectors)
    testing_vectors = np.array(testing_vectors)
    # Reshaping train and test vectors to 3 dimensions (1 represents one timestep)
    training_vectors = np.reshape(training_vectors, (training_vectors.shape[0], 1, training_vectors.shape[1]))
    testing_vectors = np.reshape(testing_vectors, (testing_vectors.shape[0], 1, testing_vectors.shape[1]))

    lstm_model = get_model()
    #Fitting the model
    lstm_model.fit(training_vectors, y_train, batch_size=64, epochs=150)
    y_pred = lstm_model.predict(testing_vectors)
    y_pred = np.around(y_pred)
    # Displays one of the rounded test-set predictions as the returned score
    st.write("Your score is", y_pred[8])
#Button widget to calculate score
button_two = st.button("Calculate Score")
#while click==True:
if button_two:
    preprocessing()
    processing()
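# To launch the app: streamlit run code.py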