-
Notifications
You must be signed in to change notification settings - Fork 8
/
Dic.java
142 lines (126 loc) · 6.28 KB
/
Dic.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import datastructures.DicHashTreeNode;
public class Dic extends Algo {
protected DicHashTreeNode hashTreeRoot;
protected int M;
protected int lastIter = Integer.MAX_VALUE;
protected int estimatedNumOfBaskets;
protected Map<Integer, Long> kDiscoveryIndices;
protected int lastKWithDiscoveries;
protected int maxBasketSize;
public Dic(String pathFormat, int startIndex, int endIndex, int _M, Set<String> alphabetSet,
int _estimatedNumOfBaskets) throws Exception {
super(pathFormat, startIndex, endIndex, alphabetSet);
hashTreeRoot = new DicHashTreeNode(0, 0);
root = hashTreeRoot;
estimatedNumOfBaskets = _estimatedNumOfBaskets;
kDiscoveryIndices = new HashMap<Integer, Long>();
kDiscoveryIndices.put(0, 1L);
kDiscoveryIndices.put(1, 1L);
// Small inputs work around
// If M is greater than the input size we will set it to the input size
// and then the Algorithm will run like Apriori
M = _M < estimatedNumOfBaskets ? _M : estimatedNumOfBaskets;
}
protected void init() {
hashTreeRoot.depth = 0;
hashTreeRoot.set = null;
lastKWithDiscoveries = 0;
maxBasketSize = 0;
}
protected void startOfSingleIteration() {
foundSomething = false;
iterNum++;
}
public boolean endOfSingleIteration(double support, long basketIndex, boolean foundSomething) {
estimatedNumOfBaskets = numOfBaskets;
return true;
}
protected void pruneAndDiscover(double support) {
hashTreeRoot.pruneAtDepth(this, support, estimatedNumOfBaskets, k, basketIndex);
if (iterNum == 1 || k < maxBasketSize) {
k++;
kDiscoveryIndices.put(k, basketIndex);
}
foundSomething = (hashTreeRoot.discoverTillDepth(this, support, estimatedNumOfBaskets, k, basketIndex));
}
/**
* Return the threshold (natural number) which an Item must appear more than to be
* considered as common. Since during the first iteration we don't know the number of
* baskets in our run we will return 0 so no item will be missed
*
* @param support
* - The relative support
* @return - The support as integer
*/
public int getSupportAsInt(double support, long discoveryIndex) {
return (int) Math.floor(support * estimatedNumOfBaskets);
}
protected boolean processSingleLine(String line, double support) {
String[] tmpBasket = Common.parseLineToBasket(line);
if (iterNum == 1) {// Collect some stats
numOfBaskets++;
int basketSize = tmpBasket.length;
numOfItems += basketSize;
if (basketSize > maxBasketSize) {
maxBasketSize = basketSize;
}
}
handleBasket(tmpBasket, k, hashTreeRoot, basketIndex, support);
if (basketIndex % M == 0) {
pruneAndDiscover(support);
if (!foundSomething) {
long prevKDiscoveryIndex = kDiscoveryIndices.get(lastKWithDiscoveries);
if (basketIndex - prevKDiscoveryIndex > estimatedNumOfBaskets) {
return false;
}
} else {
lastKWithDiscoveries = k;
}
}
return true;
}
protected void handleBasket(String[] tmpBasket, int k, DicHashTreeNode root, long basketIndex, double support) {
Arrays.sort(tmpBasket);
int subsetsSize = k > tmpBasket.length ? tmpBasket.length : k;
for (int i = 1; i <= subsetsSize; i++) {
if (estimatedNumOfBaskets == Integer.MAX_VALUE
|| basketIndex - kDiscoveryIndices.get(i) <= 2 * estimatedNumOfBaskets) {
handleOneBasketPhase(tmpBasket, i, root, basketIndex, support);
}
}
}
protected void handleOneBasketPhase(String[] tmpBasket, int subsetsSize, DicHashTreeNode root, long basketIndex,
double support) {
List<List<String>> subsets = Common.getAllSubsets(tmpBasket, subsetsSize, getAlphabet());
String word;
if (subsets == null)
return;
DicHashTreeNode traverser;
for (List<String> list : subsets) {
traverser = root;
if (traverser.childrens == null)
continue;
// assert(k==list.size());
for (int i = 0; i < subsetsSize; i++) {
word = list.get(i);
if (traverser.childrens.containsKey(word)) {
traverser = traverser.childrens.get(word);
if (i == subsetsSize - 1) {
traverser.encounterNode(basketIndex, estimatedNumOfBaskets, getSupportAsInt(support, traverser.discoveryIndex),
iterNum);
}
} else if (i == 0) {
Algo.incStats(1);
traverser = new DicHashTreeNode(1, basketIndex, word, root, iterNum, estimatedNumOfBaskets, support);
} else {
break;
}
}
}
}
}