Optimize performance by using one ArrayBuffer for search values and s…

…haring that between operations Squashed commit of the following: commit b582287b6f5d6930cf2e086f750e36a4cb92e3cf Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 15:15:00 2024 +0200 fix tests commit 523901b6539d174f277f90934fe333bc816eef92 Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 15:14:04 2024 +0200 optimize Int32Array -> Int8Array for search strings, as its all mapped to 0 - 26 commit 20f5c361a1fb7bc3a8d741c37ead68b500af562b Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 15:08:16 2024 +0200 explain index better commit 9dc9c0ed8c0040697cd939583d9b95dc85000c4f Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 15:05:05 2024 +0200 add explaining code comments commit d6f79d0b5ace740ddff70f2f2e727af15291fabc Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 15:00:49 2024 +0200 search use new clamp option commit 88ca64ccc66dc7f81d442a368e4e5bc8ec63100e Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 14:59:36 2024 +0200 fix char set to skip not being respected commit 5a8ba90d8531fbcb3911b66182c1227f35ab896a Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 14:41:39 2024 +0200 remove console logs commit 8d1b8b5afeaef6a33ced7536f5d224f69ce5904b Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 14:41:31 2024 +0200 improve base26 conversions: - don't precalculate table, as we lookup too many chars that we never need - directly convert to numbers, not string representation, as we insert numbers (optimized 2 conversion steps) commit 26acbe52cfcee6f4b535b8e79c2974e3046a621a Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 14:16:56 2024 +0200 fix search broken for unicode chars commit 8d63f452ecfd6a53fd8041c7b9bc75e5117c9a70 Author: Hanno J. Gödecke <[email protected]> Date: Fri Sep 27 14:13:21 2024 +0200 clean ups commit 209da8d7204ec03ef444ac525990a634932d46c1 Author: Hanno J. 
Gödecke <[email protected]> Date: Thu Sep 26 16:55:49 2024 +0200 wip: fixing search commit d1236c98244e704a7ed08ac186e66388d0d90f1f Author: Hanno J. Gödecke <[email protected]> Date: Thu Sep 26 16:03:54 2024 +0200 wip: working, search broken Co-authored-by: Szymon Kapała <[email protected]>
Expensify · Sep 27, 2024 · e95c851 · e95c851
1 parent 02f562d
commit e95c851
Show file tree

Hide file tree

Showing 4 changed files with 216 additions and 143 deletions.
diff --git a/src/libs/FastSearch.ts b/src/libs/FastSearch.ts
@@ -1,6 +1,6 @@
 import CONST from '@src/CONST';
 import Timing from './actions/Timing';
-import {DELIMITER_CHAR_CODE, END_CHAR_CODE, makeTree, stringToNumeric} from './SuffixUkkonenTree';
+import SuffixUkkonenTree from './SuffixUkkonenTree';
 
 type SearchableData<T> = {
     /**
@@ -21,26 +21,33 @@ const charSetToSkip = new Set(['@', '#', '$', '%', '&', '*', '+', '-', '/', ':',
  * Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for (sub-)strings in a list of strings.
  * You can provide multiple datasets. The search results will be returned for each dataset.
  */
-function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
+function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
     // Create a numeric list for the suffix tree, and a look up indexes array
     Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
-    const listsAsConcatedNumericList: number[] = [];
-    const indexesByList: Array<Array<T | undefined>> = [];
-    for (const {data, toSearchableString} of dataSet) {
-        const [numericRepresentation, searchIndexList] = dataToNumericRepresentation({data, toSearchableString});
-        // eslint-disable-next-line @typescript-eslint/prefer-for-of
-        for (let i = 0; i < numericRepresentation.length; i++) {
-            // Note: we had to use a loop here as push with spread yields a maximum call stack exceeded error
-            listsAsConcatedNumericList.push(numericRepresentation[i]);
-        }
-        indexesByList.push(searchIndexList);
+    // The user might provide multiple data sets, but internally, the search values will be stored in this one list:
+    let concatenatedNumericList = new Int8Array(new ArrayBuffer(200_000));
+    // Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
+    const occurrenceToIndex = new Int8Array(new ArrayBuffer(200_000));
+    // As we are working with ArrayBuffers, we need to keep track of the current offset:
+    const offset = {value: 0};
+    // We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
+    const listOffsets: number[] = [];
+
+    for (const {data, toSearchableString} of dataSets) {
+        // Performance critical: the array parameters are out parameters, so we don't want to create new arrays every time:
+        dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
+        listOffsets.push(offset.value);
     }
-    listsAsConcatedNumericList.push(END_CHAR_CODE);
+    concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
+    listOffsets[listOffsets.length - 1] = offset.value;
     Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
 
+    // The list might be larger than necessary, so we clamp it to the actual size:
+    concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);
+
     // Create & build the suffix tree:
     Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
-    const tree = makeTree(listsAsConcatedNumericList);
+    const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList);
     Timing.end(CONST.TIMING.SEARCH_MAKE_TREE);
 
     Timing.start(CONST.TIMING.SEARCH_BUILD_TREE);
@@ -52,29 +59,32 @@ function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
      */
     function search(searchInput: string): T[][] {
         const cleanedSearchString = cleanString(searchInput);
-        const searchValueNumeric = stringToNumeric(cleanedSearchString, charSetToSkip);
-        const result = tree.findSubstring(searchValueNumeric);
-
-        // Map the results to the original options
-        const mappedResults = Array.from({length: indexesByList.length}, () => new Set<T>());
+        const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, {
+            charSetToSkip,
+            // stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size
+            // (otherwise the search could fail as we include in our search empty array values):
+            clamp: true,
+        });
+        const result = tree.findSubstring(Array.from(numeric));
+
+        const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set<T>());
         // eslint-disable-next-line @typescript-eslint/prefer-for-of
-        for (let rI = 0; rI < result.length; rI++) {
-            let offset = 0;
-            const index = result[rI];
-            for (let i = 0; i < indexesByList.length; i++) {
-                const relativeIndex = index - offset + 1;
-                if (relativeIndex < indexesByList[i].length && relativeIndex >= 0) {
-                    const option = indexesByList[i][relativeIndex];
-                    if (option) {
-                        mappedResults[i].add(option);
-                    }
-                } else {
-                    offset += indexesByList[i].length;
-                }
+        for (let i = 0; i < result.length; i++) {
+            const occurrenceIndex = result[i];
+            const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
+            const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset);
+
+            if (dataSetIndex === -1) {
+                throw new Error('Programmatic error, this should never ever happen');
+            }
+            const item = dataSets[dataSetIndex].data[itemIndexInDataSet];
+            if (!item) {
+                throw new Error('Programmatic error, this should never ever happen');
             }
+            resultsByDataSet[dataSetIndex].add(item);
         }
 
-        return mappedResults.map((set) => Array.from(set));
+        return resultsByDataSet.map((set) => Array.from(set));
     }
 
     return {
@@ -87,9 +97,8 @@ function createFastSearch<T>(dataSet: Array<SearchableData<T>>) {
  * This function converts the user data (which are most likely objects) to a numeric representation.
  * Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data.
  */
-function dataToNumericRepresentation<T>({data, toSearchableString}: SearchableData<T>): [number[], Array<T | undefined>] {
-    const searchIndexList: Array<T | undefined> = [];
-    const allDataAsNumbers: number[] = [];
+function dataToNumericRepresentation<T>(allList: Int8Array, occurrenceToIndex: Int8Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void {
+    // const searchIndexList: Array<T | undefined> = [];
 
     data.forEach((option, index) => {
         const searchStringForTree = toSearchableString(option);
@@ -100,31 +109,28 @@ function dataToNumericRepresentation<T>({data, toSearchableString}: SearchableDa
             return;
         }
 
-        const numericRepresentation = stringToNumeric(cleanedSearchStringForTree, charSetToSkip);
-
-        // We need to push an array that has the same length as the length of the string we insert for this option:
-        const indexes = Array.from({length: numericRepresentation.length}, () => option);
-        // Note: we add undefined for the delimiter character
-        searchIndexList.push(...indexes, undefined);
-
-        allDataAsNumbers.push(...numericRepresentation);
-        if (index < data.length - 1) {
-            allDataAsNumbers.push(DELIMITER_CHAR_CODE);
-        }
+        // const oldOffset = offset.value;
+        SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, {
+            charSetToSkip,
+            out: {
+                outArray: allList,
+                offset,
+                outOccurrenceToIndex: occurrenceToIndex,
+                index,
+            },
+        });
+        // eslint-disable-next-line no-param-reassign
+        occurrenceToIndex[offset.value] = index;
+        // eslint-disable-next-line no-param-reassign
+        allList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE;
     });
-
-    return [allDataAsNumbers, searchIndexList];
 }
 
-// Removes any special characters, except for numbers and letters (including unicode letters)
-// const nonAlphanumericRegex = /[^0-9\p{L}]/gu;
-
 /**
- * Everything in the tree is treated as lowercase. Strings will additionally be cleaned from
- * special characters, as they are irrelevant for the search, and thus we can save some space.
+ * Everything in the tree is treated as lowercase.
  */
 function cleanString(input: string) {
-    return input.toLowerCase(); // .replace(nonAlphanumericRegex, '');
+    return input.toLowerCase();
 }
 
 const FastSearch = {

diff --git a/src/libs/SuffixUkkonenTree.ts → src/libs/SuffixUkkonenTree/index.ts b/src/libs/SuffixUkkonenTree.ts → src/libs/SuffixUkkonenTree/index.ts
@@ -1,4 +1,5 @@
 /* eslint-disable no-continue */
+import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, stringToNumeric} from './utils';
 
 /**
  * This implements a suffix tree using Ukkonen's algorithm.
@@ -8,89 +9,14 @@
  * You probably don't want to use this directly, but rather use @libs/FastSearch.ts as an easy-to-use wrapper around this.
  */
 
-const ALPHABET = 'abcdefghijklmnopqrstuvwxyz';
-const LETTER_ALPHABET_SIZE = ALPHABET.length;
-const ALPHABET_SIZE = LETTER_ALPHABET_SIZE + 3; // +3: special char, delimiter char, end char
-const SPECIAL_CHAR_CODE = ALPHABET_SIZE - 3;
-const DELIMITER_CHAR_CODE = ALPHABET_SIZE - 2;
-const END_CHAR_CODE = ALPHABET_SIZE - 1;
-
-function convertToBase26(num: number): string {
-    if (num < 0) {
-        throw new Error('convertToBase26: Input must be a non-negative integer');
-    }
-
-    let result = '';
-
-    do {
-        // eslint-disable-next-line no-param-reassign
-        num--;
-        result = ALPHABET[num % 26] + result;
-        // eslint-disable-next-line no-bitwise, no-param-reassign
-        num >>= 5; // Equivalent to Math.floor(num / 26), but faster
-    } while (num > 0);
-
-    return result;
-}
-
-// Pre-compute base26 lookup table
-const base26LookupTable = new Array<string>(65536);
-for (let i = 0; i < 65536; i++) {
-    base26LookupTable[i] = convertToBase26(i);
-}
-
-const letterMap = Array.from(ALPHABET).reduce((acc, char, index) => {
-    acc[char] = index;
-    return acc;
-}, {} as Record<string, number>);
-
-/**
- * Converts a string to an array of numbers representing the characters of the string.
- * Every number in the array is in the range 0-ALPHABET_SIZE (0-28).
- *
- * The numbers are offset by the character code of 'a' (97).
- * - This is so that the numbers from a-z are in the range 0-28.
- * - 26 is for encoding special characters. Character numbers that are not within the range of a-z will be encoded as "specialCharacter + base26(charCode)"
- * - 27 is for the delimiter character
- * - 28 is for the end character
- *
- * Note: The string should be converted to lowercase first (otherwise uppercase letters get base26'ed taking more space than necessary).
- */
-function stringToNumeric(input: string, charSetToSkip?: Set<string>): number[] {
-    const maxSize = input.length * 2; // Estimate maximum size
-    const res = new Array<number>(maxSize);
-    let index = 0;
-
-    for (let i = 0; i < input.length; i++) {
-        const char = input[i];
-        if (charSetToSkip?.has(char)) {
-            continue;
-        }
-
-        if (char >= 'a' && char <= 'z') {
-            res[index++] = letterMap[char];
-        } else {
-            const charCode = input.charCodeAt(i);
-            const asBase26String = base26LookupTable[charCode];
-            res[index++] = SPECIAL_CHAR_CODE;
-            // eslint-disable-next-line @typescript-eslint/prefer-for-of
-            for (let j = 0; j < asBase26String.length; j++) {
-                res[index++] = letterMap[asBase26String[j]];
-            }
-        }
-    }
-
-    return res.slice(0, index); // Trim to actual size
-}
-
 /**
  * Creates a new tree instance that can be used to build a suffix tree and search in it.
  * The input is a numeric representation of the search string, which can be created using {@link stringToNumeric}.
  * Separate search values must be separated by the {@link DELIMITER_CHAR_CODE}. The search string must end with the {@link END_CHAR_CODE}.
  *
  * The tree will be built using the Ukkonen's algorithm: https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
  */
-function makeTree(numericSearchValues: number[]) {
+function makeTree(numericSearchValues: Int8Array) {
     const maxNodes = 2 * numericSearchValues.length;
     // Allocate an ArrayBuffer to store all transitions (flat buffer)
     const buffer = new ArrayBuffer(maxNodes * ALPHABET_SIZE * 4); // 4 bytes per transition (Uint32)
@@ -211,16 +137,16 @@ function makeTree(numericSearchValues: number[]) {
      * This function will return the index(es) of found occurrences within this big string.
      * So, when searching for "an", it would return [1, 3, 8].
      */
-    function findSubstring(searchString: number[]) {
+    function findSubstring(searchValue: number[]) {
         const occurrences: number[] = [];
 
         function dfs(node: number, depth: number) {
             const leftRange = leftEdges[node];
             const rightRange = rightEdges[node] ?? defaultREdgeValue;
             const rangeLen = node === 0 ? 0 : rightRange - leftRange + 1;
 
-            for (let i = 0; i < rangeLen && depth + i < searchString.length && leftRange + i < numericSearchValues.length; i++) {
-                if (searchString[depth + i] !== numericSearchValues[leftRange + i]) {
+            for (let i = 0; i < rangeLen && depth + i < searchValue.length && leftRange + i < numericSearchValues.length; i++) {
+                if (searchValue[depth + i] !== numericSearchValues[leftRange + i]) {
                     return;
                 }
             }
@@ -230,15 +156,15 @@ function makeTree(numericSearchValues: number[]) {
                 const tNode = transitionNodes[node * ALPHABET_SIZE + i];
 
                 // Search speed optimization: don't go through the edge if it's different than the next char:
-                const correctChar = depth + rangeLen >= searchString.length || i === searchString[depth + rangeLen];
+                const correctChar = depth + rangeLen >= searchValue.length || i === searchValue[depth + rangeLen];
 
                 if (tNode && tNode !== -1 && correctChar) {
                     isLeaf = false;
                     dfs(tNode, depth + rangeLen);
                 }
             }
 
-            if (isLeaf && depth + rangeLen >= searchString.length) {
+            if (isLeaf && depth + rangeLen >= searchValue.length) {
                 occurrences.push(numericSearchValues.length - (depth + rangeLen));
             }
         }
@@ -253,4 +179,13 @@ function makeTree(numericSearchValues: number[]) {
     };
 }
 
-export {makeTree, stringToNumeric, DELIMITER_CHAR_CODE, END_CHAR_CODE};
+const SuffixUkkonenTree = {
+    makeTree,
+
+    // Re-exported from utils:
+    DELIMITER_CHAR_CODE,
+    END_CHAR_CODE,
+    stringToNumeric,
+};
+
+export default SuffixUkkonenTree;