Skip to content

Commit

Permalink
fulltext bug fixes, performance improvement and support json_value pa…
Browse files Browse the repository at this point in the history
…rser (#20230)

bug fixes for #20217 #20213 #20175 #20149
and add json_value parser

1. limit the batch size to 8192 in both the fulltext_index_scan() and fulltext_tokenize() functions
2. In the fulltext_index_scan function, create a new thread to evaluate the score in mini-batches of 8192 documents instead of waiting for all results from SQL. This speeds up the function and avoids OOM. However, the score will be calculated per mini-batch instead of over the complete result set. I think it doesn't matter as long as we have the correct answer.
3. support json_value parser
4. pre-allocate memory in the fulltext_tokenize() function to avoid malloc
5. bug fix #20149 (delete table): pkPos and pkType are needed but (doc_id, INT) is given
6. add monpl tokenizer repo to matrixone
7. bug fix: make the JSON tokenizer truncate values, and increase the token size limit to 127 bytes
8. pushdown limit
9. bug fix #20311. data race occurred during bvt test
10. alter table drop column with fulltext index
11. SQL executor add streaming mode.

Approved by: @fengttt, @badboynt1, @zhangxu19830126, @m-schen, @aunjgr, @ouyuanning, @aressu1985, @XuPeng-SH, @sukki37, @qingxinhome
  • Loading branch information
cpegeric authored Nov 29, 2024
1 parent 66b483d commit 03635a8
Show file tree
Hide file tree
Showing 31 changed files with 126,399 additions and 159 deletions.
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ require (
github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef
github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4
github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d
github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd
github.com/minio/minio-go/v7 v7.0.78
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,6 @@ github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 h1:+SmZP2bG
github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4/go.mod h1:LIHvF0fflR+zyXUQFQOiHPpKANf3UIr7DFIv5CBPOoU=
github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76 h1:MpmqMPooJ0Ea7W4ldIGbQV4D3z+sEiCu6C6aTibiwiQ=
github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0=
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78 h1:1NvZ4SBw0lH7h38VhCVxYEa61K8N+0DBv9JQhAwU48Q=
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78/go.mod h1:RQQhaM4xSocKuNi0ZvKZZAiErpINJgZrPB+vZDvBkeU=
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d h1:27vD3JGbrFmaQtDYQT/W1jFFr0xvipdwH5R4bZPGQdE=
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d/go.mod h1:RJNMd/LBgWRCpGanqXvqjVaoYXeYBS+i0MSeoN3hBMo=
github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd h1:DvqhuH3kOpsE6vXZA5WEaRNAUUUcf44S1p5VInbjdfU=
Expand Down
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
// fulltext index here
if def.IndexOption != nil {
parsername := strings.ToLower(def.IndexOption.ParserName)
if parsername != "ngram" && parsername != "default" && parsername != "json" {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
}
res["parser"] = parsername
Expand Down
26 changes: 17 additions & 9 deletions pkg/container/bytejson/fttokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,39 @@ package bytejson
import (
"iter"
"strconv"
)

"github.com/matrixorigin/monlp/tokenizer"
const (
MAX_TOKEN_SIZE = 127
)

type Token struct {
TokenBytes [1 + MAX_TOKEN_SIZE]byte
TokenPos int32
BytePos int32
}

// TokenizeValue tokenizes the values of the ByteJson object
// note that we do not break word with space, do not normalize
// case, 3-gram, etc etc, only truncate the string to 23 bytes.
func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[tokenizer.Token] {
return func(yield func(tokenizer.Token) bool) {
// case, 3-gram, etc etc, only truncate the string to 127 bytes.
func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[Token] {
return func(yield func(Token) bool) {
tokenizeOne(bj, 1, includeKey, yield)
}
}

func fillToken(t *tokenizer.Token, s []byte, pos int32) {
func fillToken(t *Token, s []byte, pos int32) {
copy(t.TokenBytes[1:], s)
if len(s) > tokenizer.MAX_TOKEN_SIZE {
t.TokenBytes[0] = tokenizer.MAX_TOKEN_SIZE
if len(s) > MAX_TOKEN_SIZE {
t.TokenBytes[0] = MAX_TOKEN_SIZE
} else {
t.TokenBytes[0] = byte(len(s))
}
t.TokenPos = pos
}

func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(tokenizer.Token) bool) int32 {
var t tokenizer.Token
func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(Token) bool) int32 {
var t Token

switch bj.Type {
case TpCodeObject:
Expand Down
36 changes: 26 additions & 10 deletions pkg/container/bytejson/fttokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ package bytejson

import (
"encoding/json"
"fmt"
"testing"

"github.com/matrixorigin/monlp/tokenizer"
"github.com/stretchr/testify/require"
)

type tokenTestCase struct {
Expand All @@ -27,15 +28,15 @@ type tokenTestCase struct {
tokensWithKey []string
}

func checkTokens(t *testing.T, tokens []tokenizer.Token, expected []string) {
func checkTokens(t *testing.T, tokens []Token, expected []string) {
if len(tokens) != len(expected) {
t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
}

for i := range tokens {
var tk tokenizer.Token
if len(expected[i]) > tokenizer.MAX_TOKEN_SIZE {
tk.TokenBytes[0] = byte(tokenizer.MAX_TOKEN_SIZE)
var tk Token
if len(expected[i]) > MAX_TOKEN_SIZE {
tk.TokenBytes[0] = byte(MAX_TOKEN_SIZE)
} else {
tk.TokenBytes[0] = byte(len(expected[i]))
}
Expand All @@ -61,8 +62,8 @@ func TestByteJson(t *testing.T) {
},
{
input: `{"a": [1.2, 2.0], "b": [3, true, "hello"], "c": "abcdefghijklmnopqrstuvwxyz"}`,
tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvw"},
tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvw"},
tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvwxyz"},
tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvwxyz"},
},
{
input: `{"a": "相见时难别亦难", "b": "I come, I see, I 征服", "c": "相见时难别亦难,东风无力百花残。 春蚕到死丝方尽,蜡炬成灰泪始干。"}`,
Expand All @@ -72,7 +73,7 @@ func TestByteJson(t *testing.T) {
{
input: `{"a bcdefghijklmnopqrstuvwxyz": 1, "学而时习之,不亦说乎": "说什么说, 就你话多"}`,
tokens: []string{"1", "说什么说, 就你话多"},
tokensWithKey: []string{"a bcdefghijklmnopqrstuv", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
tokensWithKey: []string{"a bcdefghijklmnopqrstuvwxyz", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
},
}

Expand All @@ -82,16 +83,31 @@ func TestByteJson(t *testing.T) {
t.Fatal(err)
}

var tokens []tokenizer.Token
var tokens []Token
for tk := range bj.TokenizeValue(false) {
tokens = append(tokens, tk)
}
checkTokens(t, tokens, tc.tokens)

var tokensWithKey []tokenizer.Token
var tokensWithKey []Token
for tk := range bj.TokenizeValue(true) {
tokensWithKey = append(tokensWithKey, tk)
}
checkTokens(t, tokensWithKey, tc.tokensWithKey)
}
}

func TestFillToken(t *testing.T) {
var tok Token
lv := "1234567890"
fmt.Printf("%s %d\n", lv, len(lv))

fillToken(&tok, []byte(lv), 0)
require.Equal(t, 10, int(tok.TokenBytes[0]))

for i := 0; i < 20; i++ {
lv += lv
}
fillToken(&tok, []byte(lv), 0)
require.Equal(t, 127, int(tok.TokenBytes[0]))
}
2 changes: 1 addition & 1 deletion pkg/fulltext/fulltext.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ import (
"strings"

"github.com/matrixorigin/matrixone/pkg/common/moerr"
"github.com/matrixorigin/matrixone/pkg/monlp/tokenizer"
"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
"github.com/matrixorigin/monlp/tokenizer"
)

/*
Expand Down
Loading

0 comments on commit 03635a8

Please sign in to comment.