-
Notifications
You must be signed in to change notification settings - Fork 10
/
test.R
148 lines (117 loc) · 4.67 KB
/
test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Packages attached for this script. data.table supplies rbindlist(),
# setkey(), setnames() and `:=`, all of which are used further down.
library(stringi)
library(data.table)

# NOTE(review): absolute, machine-specific project directory. Every relative
# path below (input corpora, "finalngrams", "milestone.R") resolves against it.
setwd("/Volumes/Documents, Files, & Media/GitHub/TextPrediction")

# Helper functions called below (clean, wrap, divide.conquer, combine, test,
# test.interpolate) are not defined in this file; presumably they come from
# milestone.R -- confirm there.
source("milestone.R")
# ---- Load the raw text ----
# Each corpus file is read line by line as UTF-8; skipNul = TRUE makes
# readLines() skip embedded NUL bytes instead of truncating the line.
twitter <- readLines("twitterTrain.txt", encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("blogsTrain.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("newsTrain.txt", encoding = "UTF-8", skipNul = TRUE)
# Alternative inputs (full en_US files), kept for reference:
# twitter <- readLines("final/en_US//en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# blogs <- readLines("final/en_US//en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
# news <- readLines("final/en_US//en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)

# ---- Clean the data ----
# clean() and wrap() are external helpers; their exact output is not visible
# from this file. Plain `<-` is used: at top level it creates the same global
# binding as `<<-` without searching enclosing environments first.
cleaned <- wrap(clean(c(twitter, news, blogs)))
# cleaned <- wrap(clean(twitter[1:20000]))

# ---- Train / test split ----
# Hold out 10% of the cleaned items (rounded up) as the test set.
# sample.int(n, size) draws the same indices as sample(1:n, size) but is also
# well defined when n is 0, where 1:length(cleaned) would become c(1, 0).
index <- sample.int(length(cleaned), ceiling(0.10 * length(cleaned)))
train.set <- cleaned[-index]
test.set <- cleaned[index]
# cleaned <- wrap(clean(c(twitter.all[train], blogs.all[train], news.all[train])))
# ngram.offset(train.set)
# Value handed to both divide.conquer() and combine().
# NOTE(review): presumably the number of chunks the training set is split
# into -- confirm against milestone.R.
k <- 50
# unlink("ngrams", recursive = TRUE)

# Load the per-chunk count tables for one n-gram order, stack them, and sum
# the counts of identical n-grams.
#
# name      : table name passed to combine(), e.g. "bigram".
# k         : forwarded unchanged to combine().
# word_cols : character vector of the word columns that identify an n-gram.
# key_cols  : character vector of columns to set as the data.table key.
#
# Returns a keyed data.table with the word columns followed by the summed
# count column N.
merge_ngram_counts <- function(name, k, word_cols, key_cols) {
  print(paste0("loading and merging ", name, "s..."))
  merged <- rbindlist(combine(name, k))
  # Naming the aggregate in j avoids the default "V1" column and the
  # follow-up setnames() call.
  merged <- merged[, list(N = sum(N)), by = word_cols]
  # Optional extras that were sketched here but left disabled:
  # merged[, p := N / sum(N)]
  # setorder(merged, -N)
  setkeyv(merged, key_cols)
  merged
}

# The merged tables are only built when divide.conquer() returns TRUE.
if (divide.conquer(train.set, k)) {
  # rm(unigram, bigram, trigram, fourgram); gc()
  unigram <- merge_ngram_counts("unigram", k, "word", "word")
  bigram <- merge_ngram_counts(
    "bigram", k,
    c("word1", "word2"),
    "word1"
  )
  trigram <- merge_ngram_counts(
    "trigram", k,
    c("word1", "word2", "word3"),
    c("word1", "word2")
  )
  fourgram <- merge_ngram_counts(
    "fourgram", k,
    c("word1", "word2", "word3", "word4"),
    c("word1", "word2", "word3")
  )
  print("DONE")
} else {
  # Without this the script would carry on silently and either fail later
  # with "object not found" or reuse stale tables left in the workspace.
  warning(
    "divide.conquer() did not return TRUE: unigram/bigram/trigram/fourgram ",
    "were not rebuilt; later steps will use whatever objects of those names ",
    "already exist.",
    call. = FALSE
  )
}
# ---- Persist the merged count tables ----
# unlink("finalngrams", recursive = TRUE)
# Only create the output directory when it is missing, so re-running the
# script does not emit an "already exists" warning, while a genuine creation
# failure is still reported by dir.create().
if (!dir.exists("finalngrams")) {
  dir.create("finalngrams")
}
# file.path() builds the same locations as the former "finalngrams//x.Rds"
# strings without a hand-written double separator.
saveRDS(unigram, file.path("finalngrams", "unigram.Rds"))
saveRDS(bigram, file.path("finalngrams", "bigram.Rds"))
saveRDS(trigram, file.path("finalngrams", "trigram.Rds"))
saveRDS(fourgram, file.path("finalngrams", "fourgram.Rds"))

# ---- First evaluation on the unmodified tables ----
# Evaluate on at most the first 200 held-out items. head() never pads with NA,
# whereas test.set[1:200] does when fewer than 200 items are available.
results <- test(head(test.set, 200))
# ---- Map rare words to "<unk>" ----

# Replace every word that is not in `vocab` with "<unk>" in each of
# `word_cols`, then re-aggregate so n-grams that became identical share one
# summed count.
#
# dt        : n-gram count data.table with the columns in `word_cols` plus N.
#             It is modified by reference before being aggregated.
# word_cols : character vector of word column names.
# vocab     : character vector of words to keep as they are.
#
# Returns a new data.table (word columns followed by N) keyed on all of
# `word_cols`.
replace_oov <- function(dt, word_cols, vocab) {
  for (col in word_cols) {
    is_oov <- !(dt[[col]] %in% vocab)
    dt[is_oov, (col) := "<unk>"]
  }
  # Name the aggregate directly instead of renaming "V1" afterwards.
  out <- dt[, list(N = sum(N)), by = word_cols]
  setkeyv(out, word_cols)
  out
}

# Words seen exactly once are folded into a single "<unk>" entry.
unigram[N == 1, word := "<unk>"]
unigram <- unigram[, list(N = sum(N)), by = "word"]
# Optional extras that were sketched here but left disabled:
# unigram[, p := N / sum(N)]
# setorder(unigram, -N)
setkey(unigram, word)

# unigram$word is now the vocabulary (it includes "<unk>"); anything outside
# it in the higher-order tables is replaced the same way.
bigram <- replace_oov(bigram, c("word1", "word2"), unigram$word)
trigram <- replace_oov(trigram, c("word1", "word2", "word3"), unigram$word)
fourgram <- replace_oov(
  fourgram,
  c("word1", "word2", "word3", "word4"),
  unigram$word
)

# Re-evaluate with the "<unk>"-collapsed tables. head() is used so a test set
# shorter than 200 items is not padded with NA the way test.set[1:200] is.
results2 <- test(head(test.set, 200))
results3 <- test.interpolate(head(test.set, 200))
# ---- Pruning experiments ----
# Each stage drops n-grams with one more exact count value and re-runs test()
# on at most the first 200 held-out items. head() avoids the NA padding that
# test.set[1:200] produces for a shorter test set.

# Stage 0: drop singleton bigrams/trigrams/fourgrams; unigrams are untouched.
# Note that this overwrites any earlier `results` object.
bigram <- bigram[N != 1]
trigram <- trigram[N != 1]
fourgram <- fourgram[N != 1]
results <- test(head(test.set, 200))
# results.three <- test.three(test.set[1:200])

# Stage 1: additionally drop singleton unigrams. The higher-order tables were
# already filtered on N != 1 above, so repeating that filter would be a no-op.
unigram <- unigram[N != 1]
results.minus1 <- test(head(test.set, 200))
# results.three.minus1 <- test.three(test.set[1:200])

# Stage 2: drop every n-gram whose count is exactly 2.
unigram <- unigram[N != 2]
bigram <- bigram[N != 2]
trigram <- trigram[N != 2]
fourgram <- fourgram[N != 2]
results.minus2 <- test(head(test.set, 200))
# results.three.minus2 <- test.three(test.set[1:200])

# Stage 3: drop every n-gram whose count is exactly 3.
unigram <- unigram[N != 3]
bigram <- bigram[N != 3]
trigram <- trigram[N != 3]
fourgram <- fourgram[N != 3]
results.minus3 <- test(head(test.set, 200))
# results.three.minus3 <- test.three(test.set[1:200])
# ---- Accuracy summary ----
# One row per pruning stage, one column per evaluation ("four", "three").
# Each cell is the mean of the `correct` column of a results object.
#
# The results.three* objects are only created by test.three() calls that are
# commented out in this script, so referencing them directly fails with
# "object not found". They are looked up by name instead and reported as NA
# when absent; when they do exist the table is the same as before.
# local() keeps the two helpers out of the global workspace; its value (the
# matrix) is still the value of this top-level expression.
local({
  accuracy <- function(res) mean(as.logical(res$correct))
  optional_accuracy <- function(name) {
    if (exists(name)) accuracy(get(name)) else NA_real_
  }
  rbind(
    all = c(
      four = accuracy(results),
      three = optional_accuracy("results.three")
    ),
    minus1 = c(
      accuracy(results.minus1),
      optional_accuracy("results.three.minus1")
    ),
    minus2 = c(
      accuracy(results.minus2),
      optional_accuracy("results.three.minus2")
    ),
    minus3 = c(
      accuracy(results.minus3),
      optional_accuracy("results.three.minus3")
    )
  )
})