Skip to content

Commit

Permalink
Optimize performance by using one ArrayBuffer for search values and s…
Browse files Browse the repository at this point in the history
…haring that between operations

Squashed commit of the following:

commit b582287b6f5d6930cf2e086f750e36a4cb92e3cf
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 15:15:00 2024 +0200

    fix tests

commit 523901b6539d174f277f90934fe333bc816eef92
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 15:14:04 2024 +0200

    optimize Int32Array -> Int8Array for search strings, as it's all mapped to 0 - 26

commit 20f5c361a1fb7bc3a8d741c37ead68b500af562b
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 15:08:16 2024 +0200

    explain index better

commit 9dc9c0ed8c0040697cd939583d9b95dc85000c4f
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 15:05:05 2024 +0200

    add explaining code comments

commit d6f79d0b5ace740ddff70f2f2e727af15291fabc
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 15:00:49 2024 +0200

    search use new clamp option

commit 88ca64ccc66dc7f81d442a368e4e5bc8ec63100e
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 14:59:36 2024 +0200

    fix char set to skip not being respected

commit 5a8ba90d8531fbcb3911b66182c1227f35ab896a
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 14:41:39 2024 +0200

    remove console logs

commit 8d1b8b5afeaef6a33ced7536f5d224f69ce5904b
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 14:41:31 2024 +0200

    improve base26 conversions:

    - don't precalculate table, as we lookup too many chars that we never need
    - directly convert to numbers, not string representation, as we insert numbers (optimized 2 conversion steps)

commit 26acbe52cfcee6f4b535b8e79c2974e3046a621a
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 14:16:56 2024 +0200

    fix search broken for unicode chars

commit 8d63f452ecfd6a53fd8041c7b9bc75e5117c9a70
Author: Hanno J. Gödecke <[email protected]>
Date:   Fri Sep 27 14:13:21 2024 +0200

    clean ups

commit 209da8d7204ec03ef444ac525990a634932d46c1
Author: Hanno J. Gödecke <[email protected]>
Date:   Thu Sep 26 16:55:49 2024 +0200

    wip: fixing search

commit d1236c98244e704a7ed08ac186e66388d0d90f1f
Author: Hanno J. Gödecke <[email protected]>
Date:   Thu Sep 26 16:03:54 2024 +0200

    wip: working, search broken

    Co-authored-by: Szymon Kapała <[email protected]>
  • Loading branch information
hannojg committed Sep 27, 2024
1 parent 02f562d commit e95c851
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 143 deletions.
116 changes: 61 additions & 55 deletions src/libs/FastSearch.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import CONST from '@src/CONST';
import Timing from './actions/Timing';
import {DELIMITER_CHAR_CODE, END_CHAR_CODE, makeTree, stringToNumeric} from './SuffixUkkonenTree';
import SuffixUkkonenTree from './SuffixUkkonenTree';

type SearchableData<T> = {
/**
Expand All @@ -21,26 +21,33 @@ const charSetToSkip = new Set(['@', '#', '$', '%', '&', '*', '+', '-', '/', ':',
* Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for (sub-)strings in a list of strings.
* You can provide multiple datasets. The search results will be returned for each dataset.
*/
function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
// Create a numeric list for the suffix tree, and a look up indexes array
Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
const listsAsConcatedNumericList: number[] = [];
const indexesByList: Array<Array<T | undefined>> = [];
for (const {data, toSearchableString} of dataSet) {
const [numericRepresentation, searchIndexList] = dataToNumericRepresentation({data, toSearchableString});
// eslint-disable-next-line @typescript-eslint/prefer-for-of
for (let i = 0; i < numericRepresentation.length; i++) {
// Note: we had to use a loop here as push with spread yields a maximum call stack exceeded error
listsAsConcatedNumericList.push(numericRepresentation[i]);
}
indexesByList.push(searchIndexList);
// The user might provide multiple data sets, but internally, the search values will be stored in this one list:
let concatenatedNumericList = new Int8Array(new ArrayBuffer(200_000));
// Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
const occurrenceToIndex = new Int8Array(new ArrayBuffer(200_000));
// As we are working with ArrayBuffers, we need to keep track of the current offset:
const offset = {value: 0};
// We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
const listOffsets: number[] = [];

for (const {data, toSearchableString} of dataSets) {
// Performance critical: the array parameters are out parameters, so we don't want to create new arrays every time:
dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
listOffsets.push(offset.value);
}
listsAsConcatedNumericList.push(END_CHAR_CODE);
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
listOffsets[listOffsets.length - 1] = offset.value;
Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);

// The list might be larger than necessary, so we clamp it to the actual size:
concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);

// Create & build the suffix tree:
Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
const tree = makeTree(listsAsConcatedNumericList);
const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList);
Timing.end(CONST.TIMING.SEARCH_MAKE_TREE);

Timing.start(CONST.TIMING.SEARCH_BUILD_TREE);
Expand All @@ -52,29 +59,32 @@ function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
*/
function search(searchInput: string): T[][] {
const cleanedSearchString = cleanString(searchInput);
const searchValueNumeric = stringToNumeric(cleanedSearchString, charSetToSkip);
const result = tree.findSubstring(searchValueNumeric);

// Map the results to the original options
const mappedResults = Array.from({length: indexesByList.length}, () => new Set<T>());
const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, {
charSetToSkip,
// stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size
// (otherwise the search could fail as we include in our search empty array values):
clamp: true,
});
const result = tree.findSubstring(Array.from(numeric));

const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set<T>());
// eslint-disable-next-line @typescript-eslint/prefer-for-of
for (let rI = 0; rI < result.length; rI++) {
let offset = 0;
const index = result[rI];
for (let i = 0; i < indexesByList.length; i++) {
const relativeIndex = index - offset + 1;
if (relativeIndex < indexesByList[i].length && relativeIndex >= 0) {
const option = indexesByList[i][relativeIndex];
if (option) {
mappedResults[i].add(option);
}
} else {
offset += indexesByList[i].length;
}
for (let i = 0; i < result.length; i++) {
const occurrenceIndex = result[i];
const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset);

if (dataSetIndex === -1) {
throw new Error('Programmatic error, this should never ever happen');
}
const item = dataSets[dataSetIndex].data[itemIndexInDataSet];
if (!item) {
throw new Error('Programmatic error, this should never ever happen');
}
resultsByDataSet[dataSetIndex].add(item);
}

return mappedResults.map((set) => Array.from(set));
return resultsByDataSet.map((set) => Array.from(set));
}

return {
Expand All @@ -87,9 +97,8 @@ function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
* This function converts the user data (which are most likely objects) to a numeric representation.
* Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data.
*/
function dataToNumericRepresentation<T>({data, toSearchableString}: SearchableData<T>): [number[], Array<T | undefined>] {
const searchIndexList: Array<T | undefined> = [];
const allDataAsNumbers: number[] = [];
function dataToNumericRepresentation<T>(allList: Int8Array, occurrenceToIndex: Int8Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void {
// const searchIndexList: Array<T | undefined> = [];

data.forEach((option, index) => {
const searchStringForTree = toSearchableString(option);
Expand All @@ -100,31 +109,28 @@ function dataToNumericRepresentation<T>({data, toSearchableString}: SearchableDa
return;
}

const numericRepresentation = stringToNumeric(cleanedSearchStringForTree, charSetToSkip);

// We need to push an array that has the same length as the length of the string we insert for this option:
const indexes = Array.from({length: numericRepresentation.length}, () => option);
// Note: we add undefined for the delimiter character
searchIndexList.push(...indexes, undefined);

allDataAsNumbers.push(...numericRepresentation);
if (index < data.length - 1) {
allDataAsNumbers.push(DELIMITER_CHAR_CODE);
}
// const oldOffset = offset.value;
SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, {
charSetToSkip,
out: {
outArray: allList,
offset,
outOccurrenceToIndex: occurrenceToIndex,
index,
},
});
// eslint-disable-next-line no-param-reassign
occurrenceToIndex[offset.value] = index;
// eslint-disable-next-line no-param-reassign
allList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE;
});

return [allDataAsNumbers, searchIndexList];
}

// Removes any special characters, except for numbers and letters (including unicode letters)
// const nonAlphanumericRegex = /[^0-9\p{L}]/gu;

/**
 * Normalizes a string before it is inserted into, or searched in, the tree.
 * Everything in the tree is treated as lowercase, so the only normalization applied here is lowercasing.
 * Special characters are NOT removed by this function.
 *
 * @param input - The raw string to normalize.
 * @returns The lowercased string.
 */
function cleanString(input: string): string {
    return input.toLowerCase();
}

const FastSearch = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* eslint-disable no-continue */
import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, stringToNumeric} from './utils';

/**
* This implements a suffix tree using Ukkonen's algorithm.
Expand All @@ -8,89 +9,14 @@
* You probably don't want to use this directly, but rather use @libs/FastSearch.ts as an easy-to-use wrapper around it.
*/

// The plain letters that get a direct one-number encoding (their position in this string):
const ALPHABET = 'abcdefghijklmnopqrstuvwxyz';
const LETTER_ALPHABET_SIZE = ALPHABET.length;
// Three extra codes follow directly after the letters: special char marker, delimiter, end marker.
const SPECIAL_CHAR_CODE = LETTER_ALPHABET_SIZE;
const DELIMITER_CHAR_CODE = SPECIAL_CHAR_CODE + 1;
const END_CHAR_CODE = DELIMITER_CHAR_CODE + 1;
// Total number of distinct codes (letters + the three extra codes):
const ALPHABET_SIZE = END_CHAR_CODE + 1;

/**
 * Converts a non-negative integer to its bijective base-26 representation using the letters a-z
 * (1 -> "a", 26 -> "z", 27 -> "aa", 52 -> "az", 53 -> "ba", ...).
 *
 * Every distinct input yields a distinct output. Zero has no digits in a bijective numeral system
 * and is therefore represented by the empty string.
 *
 * @param num - The non-negative integer to convert.
 * @returns The base-26 letter string for `num`.
 * @throws Error if `num` is negative.
 */
function convertToBase26(num: number): string {
    if (num < 0) {
        throw new Error('convertToBase26: Input must be a non-negative integer');
    }

    const firstLetterCharCode = 97; // char code of 'a'
    let remaining = num;
    let result = '';

    while (remaining > 0) {
        // Bijective numeration: digits run from 1..26, so shift down by one before taking the remainder.
        remaining--;
        result = String.fromCharCode(firstLetterCharCode + (remaining % 26)) + result;
        // Note: a bit shift (>> 5) would divide by 32, not by 26, and make different inputs collide.
        remaining = Math.floor(remaining / 26);
    }

    return result;
}

// Pre-computed base26 strings: the entry at index i holds convertToBase26(i), for i in 0..65535
const base26LookupTable = Array.from({length: 65536}, (_, charCode) => convertToBase26(charCode));

// Maps every letter of ALPHABET to its position in ALPHABET ('a' -> 0, 'b' -> 1, ...)
const letterMap: Record<string, number> = {};
for (let position = 0; position < ALPHABET.length; position++) {
    letterMap[ALPHABET[position]] = position;
}

/**
 * Encodes a string as a list of numbers, one or more numbers per character.
 *
 * - Characters contained in `charSetToSkip` are left out of the result entirely.
 * - The letters a-z are encoded as their position in the alphabet (0-25).
 * - Any other character is encoded as SPECIAL_CHAR_CODE (26) followed by the alphabet positions
 *   of the letters of its pre-computed base26 string (looked up by char code).
 *
 * The delimiter (27) and end (28) codes are never produced here.
 *
 * Note: Lowercase the string before calling this, otherwise uppercase letters are treated as
 * special characters and take more space than necessary.
 */
function stringToNumeric(input: string, charSetToSkip?: Set<string>): number[] {
    const encoded: number[] = [];

    for (let position = 0; position < input.length; position++) {
        const char = input[position];
        if (charSetToSkip?.has(char)) {
            continue;
        }

        const isPlainLetter = char >= 'a' && char <= 'z';
        if (isPlainLetter) {
            encoded.push(letterMap[char]);
            continue;
        }

        // Not a plain letter: emit the special marker, then the base26 letters of the char code
        encoded.push(SPECIAL_CHAR_CODE);
        for (const base26Letter of base26LookupTable[input.charCodeAt(position)]) {
            encoded.push(letterMap[base26Letter]);
        }
    }

    return encoded;
}

/**
* Creates a new tree instance that can be used to build a suffix tree and search in it.
* The input is a numeric representation of the search string, which can be created using {@link stringToNumeric}.
* Separate search values must be separated by the {@link DELIMITER_CHAR_CODE}. The search string must end with the {@link END_CHAR_CODE}.
*
* The tree will be built using the Ukkonen's algorithm: https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
*/
function makeTree(numericSearchValues: number[]) {
function makeTree(numericSearchValues: Int8Array) {
const maxNodes = 2 * numericSearchValues.length;
// Allocate an ArrayBuffer to store all transitions (flat buffer)
const buffer = new ArrayBuffer(maxNodes * ALPHABET_SIZE * 4); // 4 bytes per transition (Uint32)
Expand Down Expand Up @@ -211,16 +137,16 @@ function makeTree(numericSearchValues: number[]) {
* This function will return the index(es) of found occurrences within this big string.
* So, when searching for "an", it would return [1, 3, 8].
*/
function findSubstring(searchString: number[]) {
function findSubstring(searchValue: number[]) {
const occurrences: number[] = [];

function dfs(node: number, depth: number) {
const leftRange = leftEdges[node];
const rightRange = rightEdges[node] ?? defaultREdgeValue;
const rangeLen = node === 0 ? 0 : rightRange - leftRange + 1;

for (let i = 0; i < rangeLen && depth + i < searchString.length && leftRange + i < numericSearchValues.length; i++) {
if (searchString[depth + i] !== numericSearchValues[leftRange + i]) {
for (let i = 0; i < rangeLen && depth + i < searchValue.length && leftRange + i < numericSearchValues.length; i++) {
if (searchValue[depth + i] !== numericSearchValues[leftRange + i]) {
return;
}
}
Expand All @@ -230,15 +156,15 @@ function makeTree(numericSearchValues: number[]) {
const tNode = transitionNodes[node * ALPHABET_SIZE + i];

// Search speed optimization: don't go through the edge if it's different than the next char:
const correctChar = depth + rangeLen >= searchString.length || i === searchString[depth + rangeLen];
const correctChar = depth + rangeLen >= searchValue.length || i === searchValue[depth + rangeLen];

if (tNode && tNode !== -1 && correctChar) {
isLeaf = false;
dfs(tNode, depth + rangeLen);
}
}

if (isLeaf && depth + rangeLen >= searchString.length) {
if (isLeaf && depth + rangeLen >= searchValue.length) {
occurrences.push(numericSearchValues.length - (depth + rangeLen));
}
}
Expand All @@ -253,4 +179,13 @@ function makeTree(numericSearchValues: number[]) {
};
}

export {makeTree, stringToNumeric, DELIMITER_CHAR_CODE, END_CHAR_CODE};
/**
 * Bundles the tree factory of this file together with the constants and the string conversion
 * helper imported from './utils', so they can be accessed through a single object.
 */
const SuffixUkkonenTree = {
makeTree,

// Re-exported from utils:
DELIMITER_CHAR_CODE,
END_CHAR_CODE,
stringToNumeric,
};

export default SuffixUkkonenTree;
Loading

0 comments on commit e95c851

Please sign in to comment.