
fulltext bug fixes, performance improvement and support json_value parser #20230

Open
wants to merge 47 commits into main from json_value

Changes from all commits
47 commits
030823b
json_value
cpegeric Nov 19, 2024
08d7346
bug fix 20188
cpegeric Nov 19, 2024
d7a77d6
8192 batch size
cpegeric Nov 19, 2024
0a55284
Merge branch 'main' into json_value
cpegeric Nov 20, 2024
804231f
run with thread
cpegeric Nov 20, 2024
00e02ab
bug fix
cpegeric Nov 20, 2024
624d3e3
prealloc memory
cpegeric Nov 20, 2024
d15327e
reset slice
cpegeric Nov 20, 2024
d26a2f0
prealloc position
cpegeric Nov 20, 2024
30e534e
add tests
cpegeric Nov 20, 2024
13dfd7f
Merge branch 'main' into json_value
mergify[bot] Nov 20, 2024
f2fbc5c
increase channel size to 2
cpegeric Nov 21, 2024
ba48ca5
bug fix json value truncate to 23 byte. increase limit to 127 bytes
cpegeric Nov 21, 2024
7082237
fix tests after change max token size to 127
cpegeric Nov 21, 2024
fb52b31
add monlp
cpegeric Nov 21, 2024
894aef6
prealloc
cpegeric Nov 21, 2024
c793ede
bug fix remove one thread
cpegeric Nov 21, 2024
d23f4dd
fix sca
cpegeric Nov 21, 2024
e4fc7f2
remove shakespeare
cpegeric Nov 21, 2024
4d4ddd2
Merge branch 'main' into json_value
cpegeric Nov 21, 2024
99e24fc
file not in utf8
cpegeric Nov 21, 2024
96e1be0
Merge branch 'main' into json_value
cpegeric Nov 22, 2024
bf36c0d
pushdown limit and avoid deadlock
cpegeric Nov 22, 2024
bdfe61f
don't close errors channel
cpegeric Nov 22, 2024
1f4ff6d
remove printout
cpegeric Nov 22, 2024
0825474
more tests
cpegeric Nov 22, 2024
4aa17c8
add ut test
cpegeric Nov 22, 2024
60a912c
add tests
cpegeric Nov 23, 2024
cb64158
add toknize ut test
cpegeric Nov 23, 2024
2c78b36
go fmt
cpegeric Nov 23, 2024
3e49bca
more tests
cpegeric Nov 23, 2024
d845ee5
streaming
cpegeric Nov 23, 2024
69b6a40
streaming fix
cpegeric Nov 23, 2024
4675dd2
fix streaming close
cpegeric Nov 23, 2024
578a5e9
streaming with sql executor in separate thread
cpegeric Nov 25, 2024
fd85bc8
cleanup
cpegeric Nov 25, 2024
854af4b
cleanup
cpegeric Nov 25, 2024
9fefe12
bug fix tests
cpegeric Nov 25, 2024
783469c
fix sca test
cpegeric Nov 25, 2024
4a2cb1a
bug fix
cpegeric Nov 25, 2024
eab5567
add test
cpegeric Nov 26, 2024
282a722
fix license and add test
cpegeric Nov 26, 2024
abb08ed
add test
cpegeric Nov 26, 2024
7268b54
add license
cpegeric Nov 26, 2024
cbb9b0a
license
cpegeric Nov 26, 2024
21960d0
update tests
cpegeric Nov 26, 2024
481462e
Merge branch 'main' into json_value
cpegeric Nov 26, 2024
1 change: 0 additions & 1 deletion go.mod
@@ -53,7 +53,6 @@ require (
 	github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef
 	github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4
 	github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376
-	github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78
 	github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d
 	github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd
 	github.com/minio/minio-go/v7 v7.0.78
2 changes: 0 additions & 2 deletions go.sum
@@ -503,8 +503,6 @@ github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 h1:+SmZP2bG
 github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4/go.mod h1:LIHvF0fflR+zyXUQFQOiHPpKANf3UIr7DFIv5CBPOoU=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76 h1:MpmqMPooJ0Ea7W4ldIGbQV4D3z+sEiCu6C6aTibiwiQ=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78 h1:1NvZ4SBw0lH7h38VhCVxYEa61K8N+0DBv9JQhAwU48Q=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78/go.mod h1:RQQhaM4xSocKuNi0ZvKZZAiErpINJgZrPB+vZDvBkeU=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d h1:27vD3JGbrFmaQtDYQT/W1jFFr0xvipdwH5R4bZPGQdE=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d/go.mod h1:RJNMd/LBgWRCpGanqXvqjVaoYXeYBS+i0MSeoN3hBMo=
 github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd h1:DvqhuH3kOpsE6vXZA5WEaRNAUUUcf44S1p5VInbjdfU=
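
Together with the pkg/fulltext/fulltext.go hunk at the end of this diff, which switches the import to the in-tree pkg/monlp/tokenizer package, these go.mod and go.sum deletions drop the external github.com/matrixorigin/monlp module: the tokenizer now lives inside the repository (the "add monlp" commit above).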
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
@@ -178,7 +178,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
 	// fulltext index here
 	if def.IndexOption != nil {
 		parsername := strings.ToLower(def.IndexOption.ParserName)
-		if parsername != "ngram" && parsername != "default" && parsername != "json" {
+		if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
 			return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
 		}
 		res["parser"] = parsername
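
In the hunk above, json_value joins ngram, default, and json as the accepted fulltext parser names. Purely as an illustration of the same rule (the PR itself keeps the chained comparisons), a set-based version of the check could look like the sketch below; validParsers and checkParserName are hypothetical names, not code from this PR.

package catalog

import (
	"fmt"
	"strings"
)

// validParsers is a hypothetical lookup set mirroring the parser
// names accepted by fullTextIndexParamsToMap after this PR.
var validParsers = map[string]struct{}{
	"ngram":      {},
	"default":    {},
	"json":       {},
	"json_value": {},
}

// checkParserName rejects any name outside the set, matching the
// behavior of the chained comparisons in the hunk above.
func checkParserName(name string) error {
	if _, ok := validParsers[strings.ToLower(name)]; !ok {
		return fmt.Errorf("invalid parser %s", name)
	}
	return nil
}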
26 changes: 17 additions & 9 deletions pkg/container/bytejson/fttokenizer.go
@@ -17,31 +17,39 @@ package bytejson
 import (
 	"iter"
 	"strconv"
+)
 
-	"github.com/matrixorigin/monlp/tokenizer"
+const (
+	MAX_TOKEN_SIZE = 127
 )
 
+type Token struct {
+	TokenBytes [1 + MAX_TOKEN_SIZE]byte
+	TokenPos   int32
+	BytePos    int32
+}
+
 // TokenizeValue tokenizes the values of the ByteJson object
 // note that we do not break word with space, do not normalize
-// case, 3-gram, etc etc, only truncate the string to 23 bytes.
-func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[tokenizer.Token] {
-	return func(yield func(tokenizer.Token) bool) {
+// case, 3-gram, etc etc, only truncate the string to 127 bytes.
+func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[Token] {
+	return func(yield func(Token) bool) {
 		tokenizeOne(bj, 1, includeKey, yield)
 	}
 }
 
-func fillToken(t *tokenizer.Token, s []byte, pos int32) {
+func fillToken(t *Token, s []byte, pos int32) {
 	copy(t.TokenBytes[1:], s)
-	if len(s) > tokenizer.MAX_TOKEN_SIZE {
-		t.TokenBytes[0] = tokenizer.MAX_TOKEN_SIZE
+	if len(s) > MAX_TOKEN_SIZE {
+		t.TokenBytes[0] = MAX_TOKEN_SIZE
 	} else {
 		t.TokenBytes[0] = byte(len(s))
 	}
 	t.TokenPos = pos
 }
 
-func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(tokenizer.Token) bool) int32 {
-	var t tokenizer.Token
+func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(Token) bool) int32 {
+	var t Token
 
 	switch bj.Type {
 	case TpCodeObject:
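
To make the new token layout concrete: fillToken writes the token length into TokenBytes[0], capped at MAX_TOKEN_SIZE, so anything longer is silently truncated to 127 bytes, and the content occupies the bytes that follow. Below is a minimal standalone sketch of writing and reading that layout; encode and decode are illustrative helpers, not functions from this PR.

package main

import "fmt"

const MAX_TOKEN_SIZE = 127

// Token mirrors the struct added in fttokenizer.go above.
type Token struct {
	TokenBytes [1 + MAX_TOKEN_SIZE]byte
	TokenPos   int32
	BytePos    int32
}

// encode follows the same length-prefix scheme as fillToken:
// byte 0 holds min(len(s), MAX_TOKEN_SIZE), the content follows.
func encode(t *Token, s []byte) {
	copy(t.TokenBytes[1:], s) // copy stops at the buffer end
	n := len(s)
	if n > MAX_TOKEN_SIZE {
		n = MAX_TOKEN_SIZE
	}
	t.TokenBytes[0] = byte(n)
}

// decode recovers the (possibly truncated) token string.
func decode(t *Token) string {
	n := int(t.TokenBytes[0])
	return string(t.TokenBytes[1 : 1+n])
}

func main() {
	var t Token
	encode(&t, []byte("hello"))
	fmt.Println(decode(&t)) // prints: hello
}

The fixed-size array keeps each token free of per-token heap allocation, which is consistent with the prealloc commits in this PR.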
36 changes: 26 additions & 10 deletions pkg/container/bytejson/fttokenizer_test.go
@@ -16,9 +16,10 @@ package bytejson
 
 import (
 	"encoding/json"
+	"fmt"
 	"testing"
 
-	"github.com/matrixorigin/monlp/tokenizer"
+	"github.com/stretchr/testify/require"
 )
 
 type tokenTestCase struct {
@@ -27,15 +28,15 @@ type tokenTestCase struct {
 	tokensWithKey []string
 }
 
-func checkTokens(t *testing.T, tokens []tokenizer.Token, expected []string) {
+func checkTokens(t *testing.T, tokens []Token, expected []string) {
 	if len(tokens) != len(expected) {
 		t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
 	}
 
 	for i := range tokens {
-		var tk tokenizer.Token
-		if len(expected[i]) > tokenizer.MAX_TOKEN_SIZE {
-			tk.TokenBytes[0] = byte(tokenizer.MAX_TOKEN_SIZE)
+		var tk Token
+		if len(expected[i]) > MAX_TOKEN_SIZE {
+			tk.TokenBytes[0] = byte(MAX_TOKEN_SIZE)
 		} else {
 			tk.TokenBytes[0] = byte(len(expected[i]))
 		}
@@ -61,8 +62,8 @@ func TestByteJson(t *testing.T) {
 		},
 		{
 			input: `{"a": [1.2, 2.0], "b": [3, true, "hello"], "c": "abcdefghijklmnopqrstuvwxyz"}`,
-			tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvw"},
-			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvw"},
+			tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvwxyz"},
+			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvwxyz"},
		},
 		{
 			input: `{"a": "相见时难别亦难", "b": "I come, I see, I 征服", "c": "相见时难别亦难,东风无力百花残。 春蚕到死丝方尽,蜡炬成灰泪始干。"}`,
@@ -72,7 +73,7 @@ func TestByteJson(t *testing.T) {
 		{
 			input: `{"a bcdefghijklmnopqrstuvwxyz": 1, "学而时习之,不亦说乎": "说什么说, 就你话多"}`,
 			tokens: []string{"1", "说什么说, 就你话多"},
-			tokensWithKey: []string{"a bcdefghijklmnopqrstuv", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
+			tokensWithKey: []string{"a bcdefghijklmnopqrstuvwxyz", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
 		},
 	}
@@ -82,16 +83,31 @@ func TestByteJson(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		var tokens []tokenizer.Token
+		var tokens []Token
 		for tk := range bj.TokenizeValue(false) {
 			tokens = append(tokens, tk)
 		}
 		checkTokens(t, tokens, tc.tokens)
 
-		var tokensWithKey []tokenizer.Token
+		var tokensWithKey []Token
 		for tk := range bj.TokenizeValue(true) {
 			tokensWithKey = append(tokensWithKey, tk)
 		}
 		checkTokens(t, tokensWithKey, tc.tokensWithKey)
 	}
 }
+
+func TestFillToken(t *testing.T) {
+	var tok Token
+	lv := "1234567890"
+	fmt.Printf("%s %d\n", lv, len(lv))
+
+	fillToken(&tok, []byte(lv), 0)
+	require.Equal(t, 10, int(tok.TokenBytes[0]))
+
+	for i := 0; i < 20; i++ {
+		lv += lv
+	}
+	fillToken(&tok, []byte(lv), 0)
+	require.Equal(t, 127, int(tok.TokenBytes[0]))
+}
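
A note on the arithmetic in TestFillToken above: the loop doubles lv twenty times, growing it from 10 bytes to 10 × 2^20 bytes, far beyond MAX_TOKEN_SIZE, so the length byte written by fillToken must come back capped at 127.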
2 changes: 1 addition & 1 deletion pkg/fulltext/fulltext.go
@@ -20,8 +20,8 @@ import (
 	"strings"
 
 	"github.com/matrixorigin/matrixone/pkg/common/moerr"
+	"github.com/matrixorigin/matrixone/pkg/monlp/tokenizer"
 	"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
-	"github.com/matrixorigin/monlp/tokenizer"
 )
 
 /*