Skip to content

Commit

Permalink
Merge branch 'KubaSz4-lcp_array'
Browse files Browse the repository at this point in the history
  • Loading branch information
phishman3579 committed Jul 3, 2017
2 parents 9ac3354 + 2f826ce commit eb56f0e
Show file tree
Hide file tree
Showing 8 changed files with 601 additions and 2 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ This is a collection of algorithms and data structures which I've implement over
* [Implicit Key Treap](src/com/jwetherell/algorithms/data_structures/ImplicitKeyTreap.java)
* [KD Tree (k-dimensional tree or k-d tree)](src/com/jwetherell/algorithms/data_structures/KDTree.java)
* [List [backed by an array or a linked list]](src/com/jwetherell/algorithms/data_structures/List.java)
* [LCP Array (Longest Common Prefix) [backed by a Suffix Array]](src/com/jwetherell/algorithms/data_structures/LCPArray.java)
* [Matrix](src/com/jwetherell/algorithms/data_structures/Matrix.java)
* [Patricia Trie](src/com/jwetherell/algorithms/data_structures/PatriciaTrie.java)
* [Quad-Tree (Point-Region or MX-CIF)](src/com/jwetherell/algorithms/data_structures/QuadTree.java)
Expand All @@ -57,6 +58,7 @@ This is a collection of algorithms and data structures which I've implement over
* [Skip List](src/com/jwetherell/algorithms/data_structures/SkipList.java)
* [Splay Tree](src/com/jwetherell/algorithms/data_structures/SplayTree.java)
* [Stack [backed by an array or a linked list]](src/com/jwetherell/algorithms/data_structures/Stack.java)
* [Suffix Array](src/com/jwetherell/algorithms/data_structures/SuffixArray.java)
* [Suffix Tree (Ukkonen's algorithm)](src/com/jwetherell/algorithms/data_structures/SuffixTree.java)
* [Suffix Trie [backed by a Trie]](src/com/jwetherell/algorithms/data_structures/SuffixTrie.java)
* [Treap](src/com/jwetherell/algorithms/data_structures/Treap.java)
Expand Down Expand Up @@ -150,7 +152,9 @@ This is a collection of algorithms and data structures which I've implement over
* Graph Traversal
- [Depth First Traversal](src/com/jwetherell/algorithms/graph/DepthFirstTraversal.java)
- [Breadth First Traversal](src/com/jwetherell/algorithms/graph/BreadthFirstTraversal.java)
* [Edmonds Karp](src/com/jwetherell/algorithms/graph/EdmondsKarp.java)
* [Edmonds Karp](src/com/jwetherell/algorithms/graph/EdmondsKarp.java)
* Matching
- [Turbo Matching](src/com/jwetherell/algorithms/graph/TurboMatching.java)

## Search
* Get index of value in array
Expand Down
2 changes: 1 addition & 1 deletion src/com/jwetherell/algorithms/data_structures/KdTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* useful data structure for several applications, such as searches involving a
* multidimensional search key (e.g. range searches and nearest neighbor
* searches). k-d trees are a special case of binary space partitioning trees.
*
* <br>
* @author Justin Wetherell <[email protected]>
* @see <a href="http://en.wikipedia.org/wiki/K-d_tree">K-d_tree (Wikipedia)</a>
*/
Expand Down
77 changes: 77 additions & 0 deletions src/com/jwetherell/algorithms/data_structures/LCPArray.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.jwetherell.algorithms.data_structures;

import java.util.ArrayList;

/**
* In computer science, the longest common prefix array (LCP array) is an auxiliary
* data structure to the suffix array. It stores the lengths of the longest common
* prefixes (LCPs) between all pairs of consecutive suffixes in a sorted suffix array.
* <p>
* https://en.wikipedia.org/wiki/LCP_array
* <br>
* @author Jakub Szarawarski <[email protected]>
* @author Justin Wetherell <[email protected]>
*/
public class LCPArray {

    private static final char DEFAULT_END_SEQ_CHAR = '$';

    /** Supplies the terminated text and its suffix array. */
    private final SuffixArray suffixArrayBuilder;
    /** Lazily computed result; {@code null} until {@link #getLCPArray()} is first called. */
    private ArrayList<Integer> LCP;

    /**
     * Creates an LCP array for {@code sequence} using the default end character {@code '$'}.
     *
     * @param sequence text to index
     */
    public LCPArray(CharSequence sequence) {
        this(sequence, DEFAULT_END_SEQ_CHAR);
    }

    /**
     * Creates an LCP array for {@code sequence} using {@code endChar} as the end-of-sequence character.
     *
     * @param sequence text to index
     * @param endChar  end-of-sequence character handed to the underlying {@link SuffixArray}
     */
    public LCPArray(CharSequence sequence, char endChar) {
        suffixArrayBuilder = new SuffixArray(sequence, endChar);
    }

    /**
     * Returns the LCP array, computing it on the first call and caching it afterwards.
     * <p>
     * Element 0 is {@code null} (the first sorted suffix has no predecessor). For {@code i > 0},
     * element {@code i} is the length of the longest common prefix of the suffixes starting at
     * {@code suffixArray[i-1]} and {@code suffixArray[i]}. Example: for "banana" (text "banana$")
     * the result is {@code [null, 0, 1, 3, 0, 0, 2]}.
     *
     * @return the cached LCP list (the internal instance, not a copy)
     */
    public ArrayList<Integer> getLCPArray() {
        if (LCP == null)
            LCPAlgorithm();
        return LCP;
    }

    /** Computes the helper array in text order and then reorders it into suffix-array order. */
    private void LCPAlgorithm() {
        final ArrayList<Integer> suffixArray = suffixArrayBuilder.getSuffixArray();
        final int[] lcpByTextPosition = getLCPR(suffixArray);
        getLCPfromLCPR(lcpByTextPosition, suffixArray);
    }

    /**
     * Builds the helper array LCPR, indexed by text position, so that
     * {@code LCP[i] == LCPR[suffixArray[i]]}.
     * <p>
     * Suffixes are visited in text order; after finding a common prefix of length {@code k} for
     * position {@code i}, the scan for position {@code i+1} starts at {@code max(k-1, 0)} instead
     * of 0.
     *
     * @param suffixArray sorted suffix start indexes
     * @return LCPR values, one per suffix
     */
    private int[] getLCPR(ArrayList<Integer> suffixArray) {
        final String text = suffixArrayBuilder.getString();
        final int textLength = text.length();
        final int count = suffixArray.size();

        // Rank of every suffix (inverse of the suffix array), derived from the suffix array itself
        // so both are always mutually consistent.
        final int[] rank = new int[count];
        for (int position = 0; position < count; position++)
            rank[suffixArray.get(position)] = position;

        final int[] lcpr = new int[count];
        int carried = 0;
        for (int i = 0; i < count; i++) {
            if (rank[i] == 0) {
                // Lexicographically smallest suffix: nothing precedes it.
                lcpr[i] = 0;
                carried = 0;
                continue;
            }
            final int predecessor = suffixArray.get(rank[i] - 1);
            int common = carried;
            // The bounds checks matter when the end character also occurs inside the text: then
            // one suffix can be a proper prefix of another and no mismatch stops the scan.
            while (i + common < textLength
                    && predecessor + common < textLength
                    && text.charAt(i + common) == text.charAt(predecessor + common))
                common++;
            lcpr[i] = common;
            carried = Math.max(common - 1, 0);
        }
        return lcpr;
    }

    /**
     * Reorders LCPR (text order) into the final LCP list (suffix-array order) and stores it.
     *
     * @param lcpr        helper array produced by {@link #getLCPR(ArrayList)}
     * @param suffixArray sorted suffix start indexes
     */
    private void getLCPfromLCPR(int[] lcpr, ArrayList<Integer> suffixArray) {
        final int count = suffixArray.size();

        final ArrayList<Integer> result = new ArrayList<Integer>(Math.max(count, 1));
        result.add(null); // no value for LCP[0]
        for (int i = 1; i < count; i++)
            result.add(lcpr[suffixArray.get(i)]);
        LCP = result;
    }
}
176 changes: 176 additions & 0 deletions src/com/jwetherell/algorithms/data_structures/SuffixArray.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
package com.jwetherell.algorithms.data_structures;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

/**
* In computer science, a suffix array is a sorted array of all suffixes of a string.
* It is a data structure used, among others, in full text indices, data compression
* algorithms and within the field of bibliometrics.
* <p>
* https://en.wikipedia.org/wiki/Suffix_array
* <p>
* NOTE: This implementation returns starting indexes instead of full suffixes
* <br>
* @author Jakub Szarawarski <[email protected]>
* @author Justin Wetherell <[email protected]>
*/
public class SuffixArray {

    private static final char DEFAULT_END_SEQ_CHAR = '$';

    /** The indexed text: the input sequence, plus the end character if it was not already present. */
    private final String string;
    /** Lazily computed start indexes of the suffixes in lexicographic order. */
    private ArrayList<Integer> suffixArray = null;
    /** Lazily computed inverse of {@link #suffixArray}: rank of the suffix starting at each index. */
    private ArrayList<Integer> KMRarray = null;

    /**
     * Creates a suffix array for {@code sequence} using the default end character {@code '$'}.
     *
     * @param sequence text to index
     */
    public SuffixArray(CharSequence sequence) {
        this(sequence, DEFAULT_END_SEQ_CHAR);
    }

    /**
     * Creates a suffix array for {@code sequence}. {@code endChar} is appended to the text unless
     * the sequence already contains it somewhere.
     *
     * @param sequence text to index
     * @param endChar  end-of-sequence character
     */
    public SuffixArray(CharSequence sequence, char endChar) {
        string = buildStringWithEndChar(sequence, endChar);
    }

    /**
     * Returns the suffix array, computing it on the first call.
     *
     * @return start indexes of all suffixes of {@link #getString()} in sorted order
     *         (the internal list, not a copy)
     */
    public ArrayList<Integer> getSuffixArray() {
        if (suffixArray == null)
            KMRalgorithm();
        return suffixArray;
    }

    /**
     * Returns the inverted suffix array, computing it on the first call.
     *
     * @return inverted suffix array: element {@code i} is the rank of the suffix starting at {@code i}
     *         (the internal list, not a copy)
     */
    public ArrayList<Integer> getKMRarray() {
        if (KMRarray == null)
            KMRalgorithm();
        return KMRarray;
    }

    /**
     * @return the text that was actually indexed, including the end character
     */
    public String getString() {
        return string;
    }

    /**
     * Creates suffix array using KMR algorithm with O(n log^2 n) complexity.
     *
     * For radius r:
     *      KMR[i] == k,
     *      when string[i..i+r-1] is kth r-letter substring of string sorted lexicographically
     * KMR is counted for radius = 1,2,4,8 ...
     * KMR for radius bigger than string length is the inverted suffix array
     */
    private void KMRalgorithm() {
        final int length = string.length();

        if (length <= 1) {
            // The doubling loop below never runs for a one-character text, which would leave the
            // suffix array empty and the ranks holding raw character codes. A text of this size
            // has at most one suffix, at index 0 with rank 0.
            suffixArray = new ArrayList<Integer>(length);
            KMRarray = new ArrayList<Integer>(length);
            for (int i = 0; i < length; i++) {
                suffixArray.add(i);
                KMRarray.add(i);
            }
            return;
        }

        ArrayList<KMRsWithIndex> sortedEntries = new ArrayList<KMRsWithIndex>();
        ArrayList<Integer> ranks = getBasicKMR(length);

        // Each pass doubles the number of characters the ranks account for.
        for (int radius = 1; radius < length; radius *= 2) {
            sortedEntries = getKMRinvertedList(ranks, radius, length);
            ranks = getKMR(sortedEntries, length);
        }

        KMRarray = new ArrayList<Integer>(ranks.subList(0, length));
        final ArrayList<Integer> order = new ArrayList<Integer>(length);
        for (KMRsWithIndex entry : sortedEntries)
            order.add(entry.index);
        suffixArray = order;
    }

    /**
     * Creates KMR array for new radius from nearly inverted array.
     * Elements from inverted array need to be grouped by substring they represent.
     *
     * @param KMRinvertedList indexes are nearly inverted KMR array
     * @param length string length
     * @return KMR array for new radius; positions {@code length..2*length-1} hold -1 as padding
     */
    private ArrayList<Integer> getKMR(ArrayList<KMRsWithIndex> KMRinvertedList, int length) {
        final ArrayList<Integer> ranks = new ArrayList<Integer>(Collections.nCopies(2 * length, -1));

        int rank = 0;
        for (int i = 0; i < length; i++) {
            // Equal (begin, end) pairs share a rank; a change in either starts the next rank.
            if (i > 0 && startsNewGroup(KMRinvertedList, i))
                rank++;
            ranks.set(KMRinvertedList.get(i).index, rank);
        }
        return ranks;
    }

    /**
     * @param sortedEntries entries sorted by (beginKMR, endKMR, index)
     * @param i position to inspect, must be &gt; 0
     * @return true when entry {@code i} represents a different substring than entry {@code i-1}
     */
    private static boolean startsNewGroup(ArrayList<KMRsWithIndex> sortedEntries, int i) {
        final KMRsWithIndex previous = sortedEntries.get(i - 1);
        final KMRsWithIndex current = sortedEntries.get(i);
        return !previous.beginKMR.equals(current.beginKMR) || !previous.endKMR.equals(current.endKMR);
    }

    /**
     * helper method to create KMR array for radius = radius from KMR array for radius = radius/2
     *
     * @param KMR KMR array for radius = radius/2
     * @param radius new radius
     * @param length string length
     * @return list of KMRsWithIndex which indexes are nearly inverted KMR array
     */
    private ArrayList<KMRsWithIndex> getKMRinvertedList(ArrayList<Integer> KMR, int radius, int length) {
        final ArrayList<KMRsWithIndex> entries = new ArrayList<KMRsWithIndex>(length);
        // KMR has 2*length entries and radius < length, so i+radius always stays in range and
        // reads the -1 padding once it runs past the end of the text.
        for (int i = 0; i < length; i++)
            entries.add(new KMRsWithIndex(KMR.get(i), KMR.get(i + radius), i));

        Collections.sort(entries, new Comparator<KMRsWithIndex>() {
            @Override
            public int compare(KMRsWithIndex a, KMRsWithIndex b) {
                final int byBegin = a.beginKMR.compareTo(b.beginKMR);
                if (byBegin != 0)
                    return byBegin;
                final int byEnd = a.endKMR.compareTo(b.endKMR);
                if (byEnd != 0)
                    return byEnd;
                return a.index.compareTo(b.index);
            }
        });
        return entries;
    }

    /**
     * KMR array for radius=1, instead of initial natural numbers ascii codes are used
     *
     * @param length length of string
     * @return pseudo KMR array for radius=1, followed by {@code length} padding entries of -1
     */
    private ArrayList<Integer> getBasicKMR(int length) {
        final ArrayList<Integer> result = new ArrayList<Integer>(length * 2);
        for (int i = 0; i < length; i++)
            result.add(Integer.valueOf(string.charAt(i)));
        for (int i = 0; i < length; i++)
            result.add(-1);
        return result;
    }

    /**
     * Copies {@code sequence} and appends {@code endChar} when the sequence does not contain it.
     * A fresh builder is used per call so that instances share no mutable state.
     *
     * @param sequence input text
     * @param endChar  end-of-sequence character
     * @return the text to index
     */
    private static String buildStringWithEndChar(CharSequence sequence, char endChar) {
        final StringBuilder builder = new StringBuilder();
        builder.append(sequence);
        if (builder.indexOf(String.valueOf(endChar)) < 0)
            builder.append(endChar);
        return builder.toString();
    }

    /** Sort key for one text position: ranks of its two half-substrings plus the position itself. */
    private static final class KMRsWithIndex {
        final Integer beginKMR;
        final Integer endKMR;
        final Integer index;

        KMRsWithIndex(Integer begin, Integer end, Integer index) {
            this.beginKMR = begin;
            this.endKMR = end;
            this.index = index;
        }
    }
}
Loading

0 comments on commit eb56f0e

Please sign in to comment.