
fulltext bug fixes, performance improvement and support json_value parser #20230

Open
wants to merge 47 commits into main from json_value

Changes from all commits
47 commits
030823b
json_value
cpegeric Nov 19, 2024
08d7346
bug fix 20188
cpegeric Nov 19, 2024
d7a77d6
8192 batch size
cpegeric Nov 19, 2024
0a55284
Merge branch 'main' into json_value
cpegeric Nov 20, 2024
804231f
run with thread
cpegeric Nov 20, 2024
00e02ab
bug fix
cpegeric Nov 20, 2024
624d3e3
prealloc memory
cpegeric Nov 20, 2024
d15327e
reset slice
cpegeric Nov 20, 2024
d26a2f0
prealloc position
cpegeric Nov 20, 2024
30e534e
add tests
cpegeric Nov 20, 2024
13dfd7f
Merge branch 'main' into json_value
mergify[bot] Nov 20, 2024
f2fbc5c
increase channel size to 2
cpegeric Nov 21, 2024
ba48ca5
bug fix json value truncate to 23 byte. increase limit to 127 bytes
cpegeric Nov 21, 2024
7082237
fix tests after change max token size to 127
cpegeric Nov 21, 2024
fb52b31
add monlp
cpegeric Nov 21, 2024
894aef6
prealloc
cpegeric Nov 21, 2024
c793ede
bug fix remove one thread
cpegeric Nov 21, 2024
d23f4dd
fix sca
cpegeric Nov 21, 2024
e4fc7f2
remove shakespeare
cpegeric Nov 21, 2024
4d4ddd2
Merge branch 'main' into json_value
cpegeric Nov 21, 2024
99e24fc
file not in utf8
cpegeric Nov 21, 2024
96e1be0
Merge branch 'main' into json_value
cpegeric Nov 22, 2024
bf36c0d
pushdown limit and avoid deadlock
cpegeric Nov 22, 2024
bdfe61f
don't close errors channel
cpegeric Nov 22, 2024
1f4ff6d
remove printout
cpegeric Nov 22, 2024
0825474
more tests
cpegeric Nov 22, 2024
4aa17c8
add ut test
cpegeric Nov 22, 2024
60a912c
add tests
cpegeric Nov 23, 2024
cb64158
add toknize ut test
cpegeric Nov 23, 2024
2c78b36
go fmt
cpegeric Nov 23, 2024
3e49bca
more tests
cpegeric Nov 23, 2024
d845ee5
streaming
cpegeric Nov 23, 2024
69b6a40
streaming fix
cpegeric Nov 23, 2024
4675dd2
fix streaming close
cpegeric Nov 23, 2024
578a5e9
streaming with sql executor in separate thread
cpegeric Nov 25, 2024
fd85bc8
cleanup
cpegeric Nov 25, 2024
854af4b
cleanup
cpegeric Nov 25, 2024
9fefe12
bug fix tests
cpegeric Nov 25, 2024
783469c
fix sca test
cpegeric Nov 25, 2024
4a2cb1a
bug fix
cpegeric Nov 25, 2024
eab5567
add test
cpegeric Nov 26, 2024
282a722
fix license and add test
cpegeric Nov 26, 2024
abb08ed
add test
cpegeric Nov 26, 2024
7268b54
add license
cpegeric Nov 26, 2024
cbb9b0a
license
cpegeric Nov 26, 2024
21960d0
update tests
cpegeric Nov 26, 2024
481462e
Merge branch 'main' into json_value
cpegeric Nov 26, 2024
1 change: 0 additions & 1 deletion go.mod
@@ -53,7 +53,6 @@ require (
 	github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef
 	github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4
 	github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376
-	github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78
 	github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d
 	github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd
 	github.com/minio/minio-go/v7 v7.0.78
2 changes: 0 additions & 2 deletions go.sum
@@ -503,8 +503,6 @@ github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 h1:+SmZP2bG
 github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4/go.mod h1:LIHvF0fflR+zyXUQFQOiHPpKANf3UIr7DFIv5CBPOoU=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76 h1:MpmqMPooJ0Ea7W4ldIGbQV4D3z+sEiCu6C6aTibiwiQ=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78 h1:1NvZ4SBw0lH7h38VhCVxYEa61K8N+0DBv9JQhAwU48Q=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78/go.mod h1:RQQhaM4xSocKuNi0ZvKZZAiErpINJgZrPB+vZDvBkeU=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d h1:27vD3JGbrFmaQtDYQT/W1jFFr0xvipdwH5R4bZPGQdE=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d/go.mod h1:RJNMd/LBgWRCpGanqXvqjVaoYXeYBS+i0MSeoN3hBMo=
 github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd h1:DvqhuH3kOpsE6vXZA5WEaRNAUUUcf44S1p5VInbjdfU=
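
Together with the pkg/fulltext/fulltext.go hunk at the end of this diff, which switches the import to the in-tree pkg/monlp/tokenizer package, these go.mod and go.sum deletions drop the external github.com/matrixorigin/monlp module: the tokenizer now lives inside the repository (the "add monlp" commit above).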
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
@@ -178,7 +178,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
 	// fulltext index here
 	if def.IndexOption != nil {
 		parsername := strings.ToLower(def.IndexOption.ParserName)
-		if parsername != "ngram" && parsername != "default" && parsername != "json" {
+		if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
 			return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
 		}
 		res["parser"] = parsername
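
In the hunk above, json_value joins ngram, default, and json as the accepted fulltext parser names. Purely as an illustration of the same rule (the PR itself keeps the chained comparisons), a set-based version of the check could look like the sketch below; validParsers and checkParserName are hypothetical names, not code from this PR.

package catalog

import (
	"fmt"
	"strings"
)

// validParsers is a hypothetical lookup set mirroring the parser
// names accepted by fullTextIndexParamsToMap after this PR.
var validParsers = map[string]struct{}{
	"ngram":      {},
	"default":    {},
	"json":       {},
	"json_value": {},
}

// checkParserName rejects any name outside the set, matching the
// behavior of the chained comparisons in the hunk above.
func checkParserName(name string) error {
	if _, ok := validParsers[strings.ToLower(name)]; !ok {
		return fmt.Errorf("invalid parser %s", name)
	}
	return nil
}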
26 changes: 17 additions & 9 deletions pkg/container/bytejson/fttokenizer.go
@@ -17,31 +17,39 @@ package bytejson
 import (
 	"iter"
 	"strconv"
+)
 
-	"github.com/matrixorigin/monlp/tokenizer"
+const (
+	MAX_TOKEN_SIZE = 127
 )
 
+type Token struct {
+	TokenBytes [1 + MAX_TOKEN_SIZE]byte
+	TokenPos   int32
+	BytePos    int32
+}
+
 // TokenizeValue tokenizes the values of the ByteJson object
 // note that we do not break word with space, do not normalize
-// case, 3-gram, etc etc, only truncate the string to 23 bytes.
-func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[tokenizer.Token] {
-	return func(yield func(tokenizer.Token) bool) {
+// case, 3-gram, etc etc, only truncate the string to 127 bytes.
+func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[Token] {
+	return func(yield func(Token) bool) {
 		tokenizeOne(bj, 1, includeKey, yield)
 	}
 }
 
-func fillToken(t *tokenizer.Token, s []byte, pos int32) {
+func fillToken(t *Token, s []byte, pos int32) {
 	copy(t.TokenBytes[1:], s)
-	if len(s) > tokenizer.MAX_TOKEN_SIZE {
-		t.TokenBytes[0] = tokenizer.MAX_TOKEN_SIZE
+	if len(s) > MAX_TOKEN_SIZE {
+		t.TokenBytes[0] = MAX_TOKEN_SIZE
 	} else {
 		t.TokenBytes[0] = byte(len(s))
 	}
 	t.TokenPos = pos
 }
 
-func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(tokenizer.Token) bool) int32 {
-	var t tokenizer.Token
+func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(Token) bool) int32 {
+	var t Token
 
 	switch bj.Type {
 	case TpCodeObject:
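
To make the new token layout concrete: fillToken writes the token length into TokenBytes[0], capped at MAX_TOKEN_SIZE, so anything longer is silently truncated to 127 bytes, and the content occupies the bytes that follow. Below is a minimal standalone sketch of writing and reading that layout; encode and decode are illustrative helpers, not functions from this PR.

package main

import "fmt"

const MAX_TOKEN_SIZE = 127

// Token mirrors the struct added in fttokenizer.go above.
type Token struct {
	TokenBytes [1 + MAX_TOKEN_SIZE]byte
	TokenPos   int32
	BytePos    int32
}

// encode follows the same length-prefix scheme as fillToken:
// byte 0 holds min(len(s), MAX_TOKEN_SIZE), the content follows.
func encode(t *Token, s []byte) {
	copy(t.TokenBytes[1:], s) // copy stops at the buffer end
	n := len(s)
	if n > MAX_TOKEN_SIZE {
		n = MAX_TOKEN_SIZE
	}
	t.TokenBytes[0] = byte(n)
}

// decode recovers the (possibly truncated) token string.
func decode(t *Token) string {
	n := int(t.TokenBytes[0])
	return string(t.TokenBytes[1 : 1+n])
}

func main() {
	var t Token
	encode(&t, []byte("hello"))
	fmt.Println(decode(&t)) // prints: hello
}

The fixed-size array keeps each token free of per-token heap allocation, which is consistent with the prealloc commits in this PR.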
36 changes: 26 additions & 10 deletions pkg/container/bytejson/fttokenizer_test.go
@@ -16,9 +16,10 @@ package bytejson
 
 import (
 	"encoding/json"
+	"fmt"
 	"testing"
 
-	"github.com/matrixorigin/monlp/tokenizer"
+	"github.com/stretchr/testify/require"
 )
 
 type tokenTestCase struct {
@@ -27,15 +28,15 @@ type tokenTestCase struct {
 	tokensWithKey []string
 }
 
-func checkTokens(t *testing.T, tokens []tokenizer.Token, expected []string) {
+func checkTokens(t *testing.T, tokens []Token, expected []string) {
 	if len(tokens) != len(expected) {
 		t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
 	}
 
 	for i := range tokens {
-		var tk tokenizer.Token
-		if len(expected[i]) > tokenizer.MAX_TOKEN_SIZE {
-			tk.TokenBytes[0] = byte(tokenizer.MAX_TOKEN_SIZE)
+		var tk Token
+		if len(expected[i]) > MAX_TOKEN_SIZE {
+			tk.TokenBytes[0] = byte(MAX_TOKEN_SIZE)
 		} else {
 			tk.TokenBytes[0] = byte(len(expected[i]))
 		}
@@ -61,8 +62,8 @@ func TestByteJson(t *testing.T) {
 		},
 		{
 			input: `{"a": [1.2, 2.0], "b": [3, true, "hello"], "c": "abcdefghijklmnopqrstuvwxyz"}`,
-			tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvw"},
-			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvw"},
+			tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvwxyz"},
+			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvwxyz"},
		},
 		{
 			input: `{"a": "相见时难别亦难", "b": "I come, I see, I 征服", "c": "相见时难别亦难,东风无力百花残。 春蚕到死丝方尽,蜡炬成灰泪始干。"}`,
@@ -72,7 +73,7 @@ func TestByteJson(t *testing.T) {
 		{
 			input: `{"a bcdefghijklmnopqrstuvwxyz": 1, "学而时习之,不亦说乎": "说什么说, 就你话多"}`,
 			tokens: []string{"1", "说什么说, 就你话多"},
-			tokensWithKey: []string{"a bcdefghijklmnopqrstuv", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
+			tokensWithKey: []string{"a bcdefghijklmnopqrstuvwxyz", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
 		},
 	}
@@ -82,16 +83,31 @@ func TestByteJson(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		var tokens []tokenizer.Token
+		var tokens []Token
 		for tk := range bj.TokenizeValue(false) {
 			tokens = append(tokens, tk)
 		}
 		checkTokens(t, tokens, tc.tokens)
 
-		var tokensWithKey []tokenizer.Token
+		var tokensWithKey []Token
 		for tk := range bj.TokenizeValue(true) {
 			tokensWithKey = append(tokensWithKey, tk)
 		}
 		checkTokens(t, tokensWithKey, tc.tokensWithKey)
 	}
 }
+
+func TestFillToken(t *testing.T) {
+	var tok Token
+	lv := "1234567890"
+	fmt.Printf("%s %d\n", lv, len(lv))
+
+	fillToken(&tok, []byte(lv), 0)
+	require.Equal(t, 10, int(tok.TokenBytes[0]))
+
+	for i := 0; i < 20; i++ {
+		lv += lv
+	}
+	fillToken(&tok, []byte(lv), 0)
+	require.Equal(t, 127, int(tok.TokenBytes[0]))
+}
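
A note on the arithmetic in TestFillToken above: the loop doubles lv twenty times, growing it from 10 bytes to 10 × 2^20 bytes, far beyond MAX_TOKEN_SIZE, so the length byte written by fillToken must come back capped at 127.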
2 changes: 1 addition & 1 deletion pkg/fulltext/fulltext.go
@@ -20,8 +20,8 @@ import (
 	"strings"
 
 	"github.com/matrixorigin/matrixone/pkg/common/moerr"
+	"github.com/matrixorigin/matrixone/pkg/monlp/tokenizer"
 	"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
-	"github.com/matrixorigin/monlp/tokenizer"
 )
 
 /*