Skip to content

Commit

Permalink
fulltext bug fixes, performance improvement and support json_value pa…
Browse files Browse the repository at this point in the history
…rser (#20230)

bug fixes for #20217 #20213 #20175 #20149
and add json_value parser

1. limit the batch size to 8192 in both the fulltext_index_scan() and fulltext_tokenize() functions
2. In the fulltext_index_scan function, create a new thread to evaluate the score in mini-batches of 8192 documents instead of waiting for all results from SQL. This speeds up the function and avoids OOM. However, the score will be calculated per mini-batch instead of over the complete result set. I think it doesn't matter as long as we have the correct answer.
3. support json_value parser
4. pre-allocate memory in the fulltext_tokenize() function to avoid malloc
5. bug fix #20149 (delete table): pkPos and pkType are needed but (doc_id, INT) is given
6. add monpl tokenizer repo to matrixone
7. bug fix: make the JSON tokenizer truncate values, and increase the token size limit to 127 bytes
8. pushdown limit
9. bug fix #20311. data race occurred during bvt test
10. alter table drop column with fulltext index
11. SQL executor add streaming mode.

Approved by: @fengttt, @badboynt1, @zhangxu19830126, @m-schen, @aunjgr, @ouyuanning, @aressu1985, @XuPeng-SH, @sukki37, @qingxinhome
  • Loading branch information
cpegeric authored Nov 29, 2024
1 parent 66b483d commit 03635a8
Show file tree
Hide file tree
Showing 31 changed files with 126,399 additions and 159 deletions.
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ require (
github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef
github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4
github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d
github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd
github.com/minio/minio-go/v7 v7.0.78
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,6 @@ github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 h1:+SmZP2bG
github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4/go.mod h1:LIHvF0fflR+zyXUQFQOiHPpKANf3UIr7DFIv5CBPOoU=
github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76 h1:MpmqMPooJ0Ea7W4ldIGbQV4D3z+sEiCu6C6aTibiwiQ=
github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0=
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78 h1:1NvZ4SBw0lH7h38VhCVxYEa61K8N+0DBv9JQhAwU48Q=
github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78/go.mod h1:RQQhaM4xSocKuNi0ZvKZZAiErpINJgZrPB+vZDvBkeU=
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d h1:27vD3JGbrFmaQtDYQT/W1jFFr0xvipdwH5R4bZPGQdE=
github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d/go.mod h1:RJNMd/LBgWRCpGanqXvqjVaoYXeYBS+i0MSeoN3hBMo=
github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd h1:DvqhuH3kOpsE6vXZA5WEaRNAUUUcf44S1p5VInbjdfU=
Expand Down
2 changes: 1 addition & 1 deletion pkg/catalog/secondary_index_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
// fulltext index here
if def.IndexOption != nil {
parsername := strings.ToLower(def.IndexOption.ParserName)
if parsername != "ngram" && parsername != "default" && parsername != "json" {
if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
}
res["parser"] = parsername
Expand Down
26 changes: 17 additions & 9 deletions pkg/container/bytejson/fttokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,39 @@ package bytejson
import (
"iter"
"strconv"
)

"github.com/matrixorigin/monlp/tokenizer"
const (
MAX_TOKEN_SIZE = 127
)

type Token struct {
TokenBytes [1 + MAX_TOKEN_SIZE]byte
TokenPos int32
BytePos int32
}

// TokenizeValue tokenizes the values of the ByteJson object
// note that we do not break word with space, do not normalize
// case, 3-gram, etc etc, only truncate the string to 23 bytes.
func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[tokenizer.Token] {
return func(yield func(tokenizer.Token) bool) {
// case, 3-gram, etc etc, only truncate the string to 127 bytes.
func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[Token] {
return func(yield func(Token) bool) {
tokenizeOne(bj, 1, includeKey, yield)
}
}

func fillToken(t *tokenizer.Token, s []byte, pos int32) {
func fillToken(t *Token, s []byte, pos int32) {
copy(t.TokenBytes[1:], s)
if len(s) > tokenizer.MAX_TOKEN_SIZE {
t.TokenBytes[0] = tokenizer.MAX_TOKEN_SIZE
if len(s) > MAX_TOKEN_SIZE {
t.TokenBytes[0] = MAX_TOKEN_SIZE
} else {
t.TokenBytes[0] = byte(len(s))
}
t.TokenPos = pos
}

func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(tokenizer.Token) bool) int32 {
var t tokenizer.Token
func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(Token) bool) int32 {
var t Token

switch bj.Type {
case TpCodeObject:
Expand Down
36 changes: 26 additions & 10 deletions pkg/container/bytejson/fttokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ package bytejson

import (
"encoding/json"
"fmt"
"testing"

"github.com/matrixorigin/monlp/tokenizer"
"github.com/stretchr/testify/require"
)

type tokenTestCase struct {
Expand All @@ -27,15 +28,15 @@ type tokenTestCase struct {
tokensWithKey []string
}

func checkTokens(t *testing.T, tokens []tokenizer.Token, expected []string) {
func checkTokens(t *testing.T, tokens []Token, expected []string) {
if len(tokens) != len(expected) {
t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
}

for i := range tokens {
var tk tokenizer.Token
if len(expected[i]) > tokenizer.MAX_TOKEN_SIZE {
tk.TokenBytes[0] = byte(tokenizer.MAX_TOKEN_SIZE)
var tk Token
if len(expected[i]) > MAX_TOKEN_SIZE {
tk.TokenBytes[0] = byte(MAX_TOKEN_SIZE)
} else {
tk.TokenBytes[0] = byte(len(expected[i]))
}
Expand All @@ -61,8 +62,8 @@ func TestByteJson(t *testing.T) {
},
{
input: `{"a": [1.2, 2.0], "b": [3, true, "hello"], "c": "abcdefghijklmnopqrstuvwxyz"}`,
tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvw"},
tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvw"},
tokens: []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvwxyz"},
tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvwxyz"},
},
{
input: `{"a": "相见时难别亦难", "b": "I come, I see, I 征服", "c": "相见时难别亦难,东风无力百花残。 春蚕到死丝方尽,蜡炬成灰泪始干。"}`,
Expand All @@ -72,7 +73,7 @@ func TestByteJson(t *testing.T) {
{
input: `{"a bcdefghijklmnopqrstuvwxyz": 1, "学而时习之,不亦说乎": "说什么说, 就你话多"}`,
tokens: []string{"1", "说什么说, 就你话多"},
tokensWithKey: []string{"a bcdefghijklmnopqrstuv", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
tokensWithKey: []string{"a bcdefghijklmnopqrstuvwxyz", "1", "学而时习之,不亦说乎", "说什么说, 就你话多"},
},
}

Expand All @@ -82,16 +83,31 @@ func TestByteJson(t *testing.T) {
t.Fatal(err)
}

var tokens []tokenizer.Token
var tokens []Token
for tk := range bj.TokenizeValue(false) {
tokens = append(tokens, tk)
}
checkTokens(t, tokens, tc.tokens)

var tokensWithKey []tokenizer.Token
var tokensWithKey []Token
for tk := range bj.TokenizeValue(true) {
tokensWithKey = append(tokensWithKey, tk)
}
checkTokens(t, tokensWithKey, tc.tokensWithKey)
}
}

func TestFillToken(t *testing.T) {
var tok Token
lv := "1234567890"
fmt.Printf("%s %d\n", lv, len(lv))

fillToken(&tok, []byte(lv), 0)
require.Equal(t, 10, int(tok.TokenBytes[0]))

for i := 0; i < 20; i++ {
lv += lv
}
fillToken(&tok, []byte(lv), 0)
require.Equal(t, 127, int(tok.TokenBytes[0]))
}
2 changes: 1 addition & 1 deletion pkg/fulltext/fulltext.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ import (
"strings"

"github.com/matrixorigin/matrixone/pkg/common/moerr"
"github.com/matrixorigin/matrixone/pkg/monlp/tokenizer"
"github.com/matrixorigin/matrixone/pkg/sql/parsers/tree"
"github.com/matrixorigin/monlp/tokenizer"
)

/*
Expand Down
Loading

0 comments on commit 03635a8

Please sign in to comment.