-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_tf_idf.R
54 lines (45 loc) · 1.27 KB
/
my_tf_idf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Exploratory tf-idf computation, worked out by hand.
# Coerce dtm99 (presumably a document-term matrix - defined outside this
# script, TODO confirm) to a base matrix: rows are documents, columns terms.
m <- as.matrix(dtm99)
#m[ ,1:10]
# The corpus size is the number of rows (documents) in the matrix.
ncorpus <- dim(m)[1L]
# Inverse document frequency of a single term.
#
# Args:
#   column:      vector of per-document counts for one term (one column of
#                the document-term matrix).
#   ndocs.total: total number of documents in the corpus.
#
# Returns:
#   log(1 + ndocs.total / ndocs.found), where ndocs.found is the number of
#   entries of `column` that are greater than zero. Returns 0 when no entry
#   is positive, which includes an empty `column`.
MyIDF <- function(column, ndocs.total) {
  # Count the documents containing the term in one vectorized pass. Unlike
  # an index loop over 1:length(column), this is well defined for an empty
  # column. An NA count still stops with an error at the `if` below.
  ndocs.found <- sum(column > 0)
  if (ndocs.found == 0) {
    return(0)
  }
  log(1 + ndocs.total / ndocs.found)
}
# IDF of every term: MyIDF is applied to each column of m, so m.idf holds
# one value per term.
m.idf <- apply(m, 2, function(term.counts) MyIDF(term.counts, ncorpus))
# Scale every term column by that term's IDF in a single vectorized step.
# This computes m[i, j] * m.idf[j] for every cell, exactly as an explicit
# double loop would, but is also well defined when m has no rows or no
# columns (1:nrow(m) / 1:ncol(m) would index out of bounds there).
m.tf.idf <- sweep(m, 2, m.idf, FUN = "*")
# Preview the first few terms. The preview is capped at the number of
# columns so a vocabulary smaller than 10 terms does not raise
# "subscript out of bounds".
preview.cols <- seq_len(min(10, ncol(m)))
m[, preview.cols]
m.tf.idf[, preview.cols]
m.idf[preview.cols]
# Total raw count of each term across all documents.
m.freq <- colSums(m)
m.freq[preview.cols]
# Total tf-idf weight of each term across all documents.
m.tf.idf.total <- colSums(m.tf.idf)
m.tf.idf.total[preview.cols]
# Spot-check one term of interest, but only when the corpus contains it,
# so that a different corpus does not abort the script here.
if ("leesvill" %in% colnames(m)) {
  m[, "leesvill"]
}
# subset just the terms with the highest tf-idf value ---------------------
# Minimum total tf-idf a term needs in order to be kept; experiment with
# this value.
tf.idf.threshold <- 30
# First, add the tf-idf totals as an additional row. rbind() labels the new
# row after the variable it was given, i.e. "m.tf.idf.total".
m.tf.idf.tmp <- rbind(m.tf.idf, m.tf.idf.total)
# Preview at most 10 terms; capped so a small vocabulary cannot raise
# "subscript out of bounds".
m.tf.idf.tmp[, seq_len(min(10, ncol(m.tf.idf.tmp)))]
# Now prepare for subset command - transpose terms to be rows
m.tf.idf.transpose <- t(m.tf.idf.tmp)
m.tf.idf.transpose[seq_len(min(9, nrow(m.tf.idf.transpose))), ]
dim(m.tf.idf.transpose)
# Keep the terms whose tf-idf total exceeds the threshold, and drop the
# tf-idf-total column by selecting only the first ncorpus columns.
# seq_len(ncorpus) selects no columns when ncorpus is 0, whereas 1:ncorpus
# would expand to c(1, 0) and pick the totals column.
m.tf.idf.transpose2 <- subset(
  m.tf.idf.transpose,
  m.tf.idf.total > tf.idf.threshold,
  seq_len(ncorpus)
)
dim(m.tf.idf.transpose2)
m.tf.idf.transpose2
# m.tf.idfe <- t(m.tf.idf)