diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a1ede3c9b..e00a87d8d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -209,7 +209,7 @@ jobs: run: | if [[ $OS == ubuntu* ]]; then sudo ./dependencies.sh - python3 -m pip install setuptools + python3 -m pip install setuptools build else ./dependencies.sh fi @@ -238,7 +238,9 @@ jobs: with: repository: restream/reindexer-py - name: Install PyReindexer - run: sudo python3 setup.py install + run: | + python -m build + python -m pip install . - name: Test PyReindexer run: | cd pyreindexer diff --git a/clang-tidy/.clang-tidy b/clang-tidy/.clang-tidy deleted file mode 100644 index 063df74f1..000000000 --- a/clang-tidy/.clang-tidy +++ /dev/null @@ -1,31 +0,0 @@ -Checks: 'clang-diagnostic-*, - clang-analyzer-*, - performance-*, - bugprone-*, - -bugprone-exception-escape, - -bugprone-branch-clone, - -bugprone-easily-swappable-parameters, - -bugprone-macro-parentheses, - -bugprone-signed-char-misuse, - -bugprone-narrowing-conversions, - -bugprone-reserved-identifier, - -bugprone-implicit-widening-of-multiplication-result, - -bugprone-assignment-in-if-condition, - -bugprone-parent-virtual-call, - -bugprone-integer-division, - -bugprone-unhandled-self-assignment, - -bugprone-inc-dec-in-conditions, - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, - -performance-no-int-to-ptr, - -performance-enum-size, - -performance-avoid-endl' -# clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling - too many unnecessary warning in vendored code -# performance-no-int-to-ptr - consider how to fix this -# bugprone-macro-parentheses - consider fixing -WarningsAsErrors: '*' -HeaderFilterRegex: '.*(?= 4.0.0 are given under - # the top level key 'Diagnostics' in the output yaml files - mergekey = "Diagnostics" - merged=[] - for replacefile in glob.iglob(os.path.join(tmpdir, '*.yaml')): - content = yaml.safe_load(open(replacefile, 'r')) - if not content: - continue # Skip empty files. - merged.extend(content.get(mergekey, [])) - - if merged: - # MainSourceFile: The key is required by the definition inside - # include/clang/Tooling/ReplacementsYaml.h, but the value - # is actually never used inside clang-apply-replacements, - # so we set it to '' here. 
- output = {'MainSourceFile': '', mergekey: merged} - with open(mergefile, 'w') as out: - yaml.safe_dump(output, out) - else: - # Empty the file: - open(mergefile, 'w').close() - - -def find_binary(arg, name, build_path): - """Get the path for a binary or exit""" - if arg: - if shutil.which(arg): - return arg - else: - raise SystemExit( - "error: passed binary '{}' was not found or is not executable" - .format(arg)) - - built_path = os.path.join(build_path, "bin", name) - binary = shutil.which(name) or shutil.which(built_path) - if binary: - return binary - else: - raise SystemExit( - "error: failed to find {} in $PATH or at {}" - .format(name, built_path)) - - -def apply_fixes(args, clang_apply_replacements_binary, tmpdir): - """Calls clang-apply-fixes on a given directory.""" - invocation = [clang_apply_replacements_binary] - invocation.append('-ignore-insert-conflict') - if args.format: - invocation.append('-format') - if args.style: - invocation.append('-style=' + args.style) - invocation.append(tmpdir) - subprocess.call(invocation) - - -def run_tidy(args, clang_tidy_binary, tmpdir, build_path, queue, lock, - failed_files): - """Takes filenames out of queue and runs clang-tidy on them.""" - while True: - name = queue.get() - invocation = get_tidy_invocation(name, clang_tidy_binary, args.checks, - tmpdir, build_path, args.header_filter, - args.allow_enabling_alpha_checkers, - args.extra_arg, args.extra_arg_before, - args.quiet, args.config_file, args.config, - args.line_filter, args.use_color, - args.plugins) - - proc = subprocess.Popen(invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, err = proc.communicate() - if proc.returncode != 0: - if proc.returncode < 0: - msg = "%s: terminated by signal %d\n" % (name, -proc.returncode) - err += msg.encode('utf-8') - failed_files.append(name) - with lock: - sys.stdout.write(' '.join(invocation) + '\n' + output.decode('utf-8')) - if len(err) > 0: - sys.stdout.flush() - sys.stderr.write(err.decode('utf-8')) - queue.task_done() - - -def main(): - parser = argparse.ArgumentParser(description='Runs clang-tidy over all files ' - 'in a compilation database. Requires ' - 'clang-tidy and clang-apply-replacements in ' - '$PATH or in your build directory.') - parser.add_argument('-allow-enabling-alpha-checkers', - action='store_true', help='allow alpha checkers from ' - 'clang-analyzer.') - parser.add_argument('-clang-tidy-binary', metavar='PATH', - default='clang-tidy-18', - help='path to clang-tidy binary') - parser.add_argument('-clang-apply-replacements-binary', metavar='PATH', - default='clang-apply-replacements-18', - help='path to clang-apply-replacements binary') - parser.add_argument('-checks', default=None, - help='checks filter, when not specified, use clang-tidy ' - 'default') - config_group = parser.add_mutually_exclusive_group() - config_group.add_argument('-config', default=None, - help='Specifies a configuration in YAML/JSON format: ' - ' -config="{Checks: \'*\', ' - ' CheckOptions: {x: y}}" ' - 'When the value is empty, clang-tidy will ' - 'attempt to find a file named .clang-tidy for ' - 'each source file in its parent directories.') - config_group.add_argument('-config-file', default=None, - help='Specify the path of .clang-tidy or custom config ' - 'file: e.g. -config-file=/some/path/myTidyConfigFile. ' - 'This option internally works exactly the same way as ' - '-config option after reading specified config file. 
' - 'Use either -config-file or -config, not both.') - parser.add_argument('-header-filter', default=None, - help='regular expression matching the names of the ' - 'headers to output diagnostics from. Diagnostics from ' - 'the main file of each translation unit are always ' - 'displayed.') - parser.add_argument('-line-filter', default=None, - help='List of files with line ranges to filter the' - 'warnings.') - if yaml: - parser.add_argument('-export-fixes', metavar='filename', dest='export_fixes', - help='Create a yaml file to store suggested fixes in, ' - 'which can be applied with clang-apply-replacements.') - parser.add_argument('-j', type=int, default=0, - help='number of tidy instances to be run in parallel.') - parser.add_argument('files', nargs='*', default=['.*'], - help='files to be processed (regex on path)') - parser.add_argument('-fix', action='store_true', help='apply fix-its') - parser.add_argument('-format', action='store_true', help='Reformat code ' - 'after applying fixes') - parser.add_argument('-style', default='file', help='The style of reformat ' - 'code after applying fixes') - parser.add_argument('-use-color', type=strtobool, nargs='?', const=True, - help='Use colors in diagnostics, overriding clang-tidy\'s' - ' default behavior. This option overrides the \'UseColor' - '\' option in .clang-tidy file, if any.') - parser.add_argument('-p', dest='build_path', - help='Path used to read a compile command database.') - parser.add_argument('-extra-arg', dest='extra_arg', - action='append', default=[], - help='Additional argument to append to the compiler ' - 'command line.') - parser.add_argument('-extra-arg-before', dest='extra_arg_before', - action='append', default=[], - help='Additional argument to prepend to the compiler ' - 'command line.') - parser.add_argument('-ignore', default=DEFAULT_CLANG_TIDY_IGNORE, - help='File path to clang-tidy-ignore') - parser.add_argument('-quiet', action='store_true', - help='Run clang-tidy in quiet mode') - parser.add_argument('-load', dest='plugins', - action='append', default=[], - help='Load the specified plugin in clang-tidy.') - args = parser.parse_args() - - db_path = 'compile_commands.json' - - if args.build_path is not None: - build_path = args.build_path - else: - # Find our database - build_path = find_compilation_database(db_path) - - clang_tidy_binary = find_binary(args.clang_tidy_binary, "clang-tidy", - build_path) - - tmpdir = None - if args.fix or (yaml and args.export_fixes): - clang_apply_replacements_binary = find_binary( - args.clang_apply_replacements_binary, "clang-apply-replacements", - build_path) - tmpdir = tempfile.mkdtemp() - - try: - invocation = get_tidy_invocation("", clang_tidy_binary, args.checks, - None, build_path, args.header_filter, - args.allow_enabling_alpha_checkers, - args.extra_arg, args.extra_arg_before, - args.quiet, args.config_file, args.config, - args.line_filter, args.use_color, - args.plugins) - invocation.append('-list-checks') - invocation.append('-') - if args.quiet: - # Even with -quiet we still want to check if we can call clang-tidy. - with open(os.devnull, 'w') as dev_null: - subprocess.check_call(invocation, stdout=dev_null) - else: - subprocess.check_call(invocation) - except: - print("Unable to run clang-tidy.", file=sys.stderr) - sys.exit(1) - - # Load the database and extract all files. 
- database = json.load(open(os.path.join(build_path, db_path))) - files = set([make_absolute(entry['file'], entry['directory']) - for entry in database]) - files, excluded = filter_files(args.ignore, files) - if excluded: - print("Excluding the following files:\n" + "\n".join(excluded) + "\n") - - max_task = args.j - if max_task == 0: - max_task = multiprocessing.cpu_count() - - # Build up a big regexy filter from all command line arguments. - file_name_re = re.compile('|'.join(args.files)) - - return_code = 0 - try: - # Spin up a bunch of tidy-launching threads. - task_queue = queue.Queue(max_task) - # List of files with a non-zero return code. - failed_files = [] - lock = threading.Lock() - for _ in range(max_task): - t = threading.Thread(target=run_tidy, - args=(args, clang_tidy_binary, tmpdir, build_path, - task_queue, lock, failed_files)) - t.daemon = True - t.start() - - # Fill the queue with files. - for name in files: - if file_name_re.search(name): - task_queue.put(name) - - # Wait for all threads to be done. - task_queue.join() - if len(failed_files): - return_code = 1 - - except KeyboardInterrupt: - # This is a sad hack. Unfortunately subprocess goes - # bonkers with ctrl-c and we start forking merrily. - print('\nCtrl-C detected, goodbye.') - if tmpdir: - shutil.rmtree(tmpdir) - os.kill(0, 9) - - if yaml and args.export_fixes: - print('Writing fixes to ' + args.export_fixes + ' ...') - try: - merge_replacement_files(tmpdir, args.export_fixes) - except: - print('Error exporting fixes.\n', file=sys.stderr) - traceback.print_exc() - return_code=1 - - if args.fix: - print('Applying fixes ...') - try: - apply_fixes(args, clang_apply_replacements_binary, tmpdir) - except: - print('Error applying fixes.\n', file=sys.stderr) - traceback.print_exc() - return_code = 1 - - if tmpdir: - shutil.rmtree(tmpdir) - sys.exit(return_code) - - -if __name__ == '__main__': - main() diff --git a/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh b/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh new file mode 100755 index 000000000..873181e20 --- /dev/null +++ b/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# Task: https://github.com/restream/reindexer/-/issues/1188 +set -e + +function KillAndRemoveServer { + local pid=$1 + kill $pid + wait $pid + yum remove -y 'reindexer*' > /dev/null +} + +function WaitForDB { + # wait until DB is loaded + set +e # disable "exit on error" so the script won't stop when DB's not loaded yet + is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list'); + while [[ $is_connected != "test" ]] + do + sleep 2 + is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list'); + done + set -e +} + +function CompareNamespacesLists { + local ns_list_actual=$1 + local ns_list_expected=$2 + local pid=$3 + + diff=$(echo ${ns_list_actual[@]} ${ns_list_expected[@]} | tr ' ' '\n' | sort | uniq -u) # compare in any order + if [ "$diff" == "" ]; then + echo "## PASS: namespaces list not changed" + else + echo "##### FAIL: namespaces list was changed" + echo "expected: $ns_list_expected" + echo "actual: $ns_list_actual" + KillAndRemoveServer $pid; + exit 1 + fi +} + +function CompareMemstats { + local actual=$1 + local expected=$2 + local pid=$3 + diff=$(echo ${actual[@]} ${expected[@]} | tr ' ' '\n' | sed 's/\(.*\),$/\1/' | sort | uniq -u) # compare in any order + if [ "$diff" == "" ]; then + echo "## PASS: memstats not changed" + else + echo "##### FAIL: memstats was changed" + echo 
"expected: $expected" + echo "actual: $actual" + KillAndRemoveServer $pid; + exit 1 + fi +} + + +RX_SERVER_CURRENT_VERSION_RPM="$(basename build/reindexer-*server*.rpm)" +VERSION_FROM_RPM=$(echo "$RX_SERVER_CURRENT_VERSION_RPM" | grep -o '.*server-..') +VERSION=$(echo ${VERSION_FROM_RPM: -2:1}) # one-digit version + +echo "## choose latest release rpm file" +if [ $VERSION == 3 ]; then + LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 3) + namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg' +elif [ $VERSION == 4 ]; then + LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 4) + # replicationstats ns added for v4 + namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\n#replicationstats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg' +else + echo "Unknown version" + exit 1 +fi + +echo "## downloading latest release rpm file: $LATEST_RELEASE" +curl "http://repo.itv.restr.im/itv-api-ng/7/x86_64/$LATEST_RELEASE" --output $LATEST_RELEASE; +echo "## downloading example DB" +curl "https://github.com/restream/reindexer_testdata/-/raw/main/dump_demo.zip" --output dump_demo.zip; +unzip -o dump_demo.zip # unzips into demo_test.rxdump; + +ADDRESS="cproto://127.0.0.1:6534/" +DB_NAME="test" + +memstats_expected=$'[ +{"name":"account_recommendations","replication":{"data_hash":6833710705,"data_count":1}}, +{"name":"admin_roles","replication":{"data_hash":1896088071,"data_count":2}}, +{"name":"asset_video_servers","replication":{"data_hash":7404222244,"data_count":97}}, +{"name":"banners","replication":{"data_hash":0,"data_count":0}}, +{"name":"channels","replication":{"data_hash":457292509431319,"data_count":3941}}, +{"name":"child_account_recommendations","replication":{"data_hash":6252344969,"data_count":1}}, +{"name":"collections","replication":{"data_hash":0,"data_count":0}}, +{"name":"epg","replication":{"data_hash":-7049751653258,"data_count":1623116}}, +{"name":"epg_genres","replication":{"data_hash":8373644068,"data_count":1315}}, +{"name":"karaoke_items","replication":{"data_hash":5858155773472,"data_count":4500}}, +{"name":"media_item_recommendations","replication":{"data_hash":-6520334670,"data_count":35886}}, +{"name":"media_items","replication":{"data_hash":-1824301168479972392,"data_count":65448}}, +{"name":"media_view_templates","replication":{"data_hash":0,"data_count":0}}, 
+{"name":"menu_items","replication":{"data_hash":0,"data_count":0}}, +{"name":"purchase_options_ext_dict","replication":{"data_hash":24651210926,"data_count":3}}, +{"name":"radio_channels","replication":{"data_hash":37734732881,"data_count":28}}, +{"name":"recom_epg_archive_default","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_archive_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_live_default","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_live_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_media_items_default","replication":{"data_hash":8288213744,"data_count":3}}, +{"name":"recom_media_items_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_media_items_similars","replication":{"data_hash":-672103903,"data_count":33538}}, +{"name":"services","replication":{"data_hash":0,"data_count":0}}, +{"name":"wp_imports_tasks","replication":{"data_hash":777859741066,"data_count":1145}}, +{"name":"wp_tasks_schedule","replication":{"data_hash":12595790956,"data_count":4}}, +{"name":"wp_tasks_tasks","replication":{"data_hash":28692716680,"data_count":281}} +] +Returned 27 rows' + +echo "##### Forward compatibility test #####" + +DB_PATH=$(pwd)"/rx_db" + +echo "Database: "$DB_PATH + +echo "## installing latest release: $LATEST_RELEASE" +yum install -y $LATEST_RELEASE > /dev/null; +# run RX server with disabled logging +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +reindexer_tool --dsn $ADDRESS$DB_NAME -f demo_test.rxdump --createdb; +sleep 1; + +namespaces_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_1; +CompareNamespacesLists "${namespaces_1[@]}" "${namespaces_list_expected[@]}" $server_pid; + +memstats_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_1[@]}" "${memstats_expected[@]}" $server_pid; + +KillAndRemoveServer $server_pid; + +echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM" +yum install -y build/*.rpm > /dev/null; +reindexer_server -l0 --corelog=none --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +WaitForDB + +namespaces_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_2; +CompareNamespacesLists "${namespaces_2[@]}" "${namespaces_1[@]}" $server_pid; + + +memstats_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_2[@]}" "${memstats_1[@]}" $server_pid; + +KillAndRemoveServer $server_pid; +rm -rf $DB_PATH; +sleep 1; + +echo "##### Backward compatibility test #####" + +echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM" +yum install -y build/*.rpm > /dev/null; +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! 
+sleep 2; + +reindexer_tool --dsn $ADDRESS$DB_NAME -f demo_test.rxdump --createdb; +sleep 1; + +namespaces_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_3; +CompareNamespacesLists "${namespaces_3[@]}" "${namespaces_list_expected[@]}" $server_pid; + + +memstats_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_3[@]}" "${memstats_expected[@]}" $server_pid; + +KillAndRemoveServer $server_pid; + +echo "## installing latest release: $LATEST_RELEASE" +yum install -y $LATEST_RELEASE > /dev/null; +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +WaitForDB + +namespaces_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_4; +CompareNamespacesLists "${namespaces_4[@]}" "${namespaces_3[@]}" $server_pid; + +memstats_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_4[@]}" "${memstats_3[@]}" $server_pid; + +KillAndRemoveServer $server_pid; +rm -rf $DB_PATH; diff --git a/cpp_src/core/ft/config/ftfastconfig.cc b/cpp_src/core/ft/config/ftfastconfig.cc index 5a1ec4cb0..e900abf27 100644 --- a/cpp_src/core/ft/config/ftfastconfig.cc +++ b/cpp_src/core/ft/config/ftfastconfig.cc @@ -131,8 +131,8 @@ void FtFastConfig::parse(std::string_view json, const RHashMap const std::string splitterStr = toLower(root["splitter"].As("fast")); if (splitterStr == "fast") { splitterType = Splitter::Fast; - } else if (splitterStr == "friso") { - splitterType = Splitter::Friso; + } else if (splitterStr == "friso" || splitterStr == "mmseg_cn") { + splitterType = Splitter::MMSegCN; } else { throw Error(errParseJson, "FtFastConfig: unknown splitter value: %s", splitterStr); } @@ -185,8 +185,8 @@ std::string FtFastConfig::GetJson(const fast_hash_map& fields) case Splitter::Fast: jsonBuilder.Put("splitter", "fast"); break; - case Splitter::Friso: - jsonBuilder.Put("splitter", "friso"); + case Splitter::MMSegCN: + jsonBuilder.Put("splitter", "mmseg_cn"); break; } diff --git a/cpp_src/core/ft/config/ftfastconfig.h b/cpp_src/core/ft/config/ftfastconfig.h index 76a9a99bd..3e8fdfd59 100644 --- a/cpp_src/core/ft/config/ftfastconfig.h +++ b/cpp_src/core/ft/config/ftfastconfig.h @@ -53,7 +53,7 @@ struct FtFastConfig : public BaseFTConfig { int maxAreasInDoc = 5; int maxTotalAreasToCache = -1; - enum class Splitter { Fast, Friso } splitterType = Splitter::Fast; + enum class Splitter { Fast, MMSegCN } splitterType = Splitter::Fast; RVector fieldsCfg; enum class Optimization { CPU, Memory } optimization = Optimization::Memory; diff --git a/cpp_src/core/ft/ft_fast/dataholder.cc b/cpp_src/core/ft/ft_fast/dataholder.cc index 55ab52d97..fad988b72 100644 --- a/cpp_src/core/ft/ft_fast/dataholder.cc +++ b/cpp_src/core/ft/ft_fast/dataholder.cc @@ -132,7 +132,7 @@ DataHolder::DataHolder(FtFastConfig* c) { cfg_ = c; if (cfg_->splitterType == FtFastConfig::Splitter::Fast) { splitter_ = make_intrusive(cfg_->extraWordSymbols); - } else if (cfg_->splitterType == FtFastConfig::Splitter::Friso) { + } else if (cfg_->splitterType == FtFastConfig::Splitter::MMSegCN) { splitter_ = make_intrusive(); } else { assertrx_throw(false); diff --git a/cpp_src/core/namespacedef.h b/cpp_src/core/namespacedef.h index 05f12ecd5..6450b262f 100644 --- a/cpp_src/core/namespacedef.h +++ 
b/cpp_src/core/namespacedef.h @@ -14,7 +14,7 @@ class WrSerializer; struct NamespaceDef { NamespaceDef() = default; - NamespaceDef(const std::string& iname, StorageOpts istorage = StorageOpts().Enabled().CreateIfMissing()) + explicit NamespaceDef(const std::string& iname, StorageOpts istorage = StorageOpts().Enabled().CreateIfMissing()) : name(iname), storage(istorage) {} NamespaceDef& AddIndex(const std::string& iname, const std::string& indexType, const std::string& fieldType, diff --git a/cpp_src/core/query/query.cc b/cpp_src/core/query/query.cc index e57e1d95d..ce4cf0cab 100644 --- a/cpp_src/core/query/query.cc +++ b/cpp_src/core/query/query.cc @@ -45,7 +45,7 @@ void Query::checkSubQuery() const { void Query::checkSubQueryNoData() const { if rx_unlikely (!aggregations_.empty()) { - throw Error{errQueryExec, "Aggregaton cannot be in subquery with condition Any or Empty"}; + throw Error{errQueryExec, "Aggregation cannot be in subquery with condition Any or Empty"}; } if rx_unlikely (HasLimit() && Limit() != 0) { throw Error{errQueryExec, "Limit cannot be in subquery with condition Any or Empty"}; @@ -175,7 +175,7 @@ void Query::checkSetObjectValue(const Variant& value) const { } } -VariantArray Query::deserializeValues(Serializer& ser, CondType cond) { +VariantArray Query::deserializeValues(Serializer& ser, CondType cond) const { VariantArray values; auto cnt = ser.GetVarUint(); if (cond == CondDWithin) { @@ -198,6 +198,10 @@ VariantArray Query::deserializeValues(Serializer& ser, CondType cond) { return values; } +void Query::deserializeJoinOn(Serializer&) { + throw Error(errLogic, "Unexpected call. JoinOn actual only for JoinQuery"); +} + void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { bool end = false; std::vector> equalPositions; @@ -252,7 +256,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { aggregations_.emplace_back(type, std::move(fields)); auto& ae = aggregations_.back(); while (!ser.Eof() && !aggEnd) { - int atype = ser.GetVarUint(); + auto atype = ser.GetVarUint(); switch (atype) { case QueryAggregationSort: { auto fieldName = ser.GetVString(); @@ -287,7 +291,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { if (sortingEntry.expression.length()) { sortingEntries_.push_back(std::move(sortingEntry)); } - int cnt = ser.GetVarUint(); + auto cnt = ser.GetVarUint(); if (cnt != 0 && sortingEntries_.size() != 1) { throw Error(errParams, "Forced sort order is allowed for the first sorting entry only"); } @@ -298,12 +302,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { break; } case QueryJoinOn: { - const OpType op = static_cast(ser.GetVarUint()); - const CondType condition = static_cast(ser.GetVarUint()); - std::string leftFieldName{ser.GetVString()}; - std::string rightFieldName{ser.GetVString()}; - reinterpret_cast(this)->joinEntries_.emplace_back(op, condition, std::move(leftFieldName), - std::move(rightFieldName)); + deserializeJoinOn(ser); break; } case QueryDebugLevel: @@ -350,7 +349,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { VariantArray val; std::string field(ser.GetVString()); bool isArray = ser.GetVarUint(); - int numValues = ser.GetVarUint(); + auto numValues = ser.GetVarUint(); bool hasExpressions = false; while (numValues--) { hasExpressions = ser.GetVarUint(); @@ -362,7 +361,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { case QueryUpdateField: { VariantArray val; std::string field(ser.GetVString()); - int numValues = 
ser.GetVarUint(); + auto numValues = ser.GetVarUint(); bool isArray = numValues > 1; bool hasExpressions = false; while (numValues--) { @@ -376,7 +375,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { VariantArray val; std::string field(ser.GetVString()); bool hasExpressions = false; - int numValues = ser.GetVarUint(); + auto numValues = ser.GetVarUint(); val.MarkArray(ser.GetVarUint() == 1); while (numValues--) { hasExpressions = ser.GetVarUint(); @@ -428,7 +427,10 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { entries_.Get(eqPos.first - 1).equalPositions.emplace_back(std::move(eqPos.second)); } } - return; +} + +void Query::serializeJoinEntries(WrSerializer&) const { + throw Error(errLogic, "Unexpected call. JoinEntries actual only for JoinQuery"); } void Query::Serialize(WrSerializer& ser, uint8_t mode) const { @@ -473,13 +475,7 @@ void Query::Serialize(WrSerializer& ser, uint8_t mode) const { } if (mode & WithJoinEntries) { - for (const auto& qje : reinterpret_cast(this)->joinEntries_) { - ser.PutVarUint(QueryJoinOn); - ser.PutVarUint(qje.Operation()); - ser.PutVarUint(qje.Condition()); - ser.PutVString(qje.LeftFieldName()); - ser.PutVString(qje.RightFieldName()); - } + serializeJoinEntries(ser); } for (const auto& equalPoses : entries_.equalPositions) { @@ -747,4 +743,22 @@ bool Query::IsWALQuery() const noexcept { return false; } +void JoinedQuery::deserializeJoinOn(Serializer& ser) { + const OpType op = static_cast(ser.GetVarUint()); + const CondType condition = static_cast(ser.GetVarUint()); + std::string leftFieldName{ser.GetVString()}; + std::string rightFieldName{ser.GetVString()}; + joinEntries_.emplace_back(op, condition, std::move(leftFieldName), std::move(rightFieldName)); +} + +void JoinedQuery::serializeJoinEntries(WrSerializer& ser) const { + for (const auto& qje : joinEntries_) { + ser.PutVarUint(QueryJoinOn); + ser.PutVarUint(qje.Operation()); + ser.PutVarUint(qje.Condition()); + ser.PutVString(qje.LeftFieldName()); + ser.PutVString(qje.RightFieldName()); + } +} + } // namespace reindexer diff --git a/cpp_src/core/query/query.h b/cpp_src/core/query/query.h index 7e8f9bbd6..77945cbd4 100644 --- a/cpp_src/core/query/query.h +++ b/cpp_src/core/query/query.h @@ -34,6 +34,12 @@ class Query { : namespace_(std::forward(nsName)), start_(start), count_(count), calcTotal_(calcTotal) {} Query() = default; + virtual ~Query() = default; + + Query(Query&& other) noexcept = default; + Query& operator=(Query&& other) noexcept = default; + Query(const Query& other) = default; + Query& operator=(const Query& other) = delete; /// Allows to compare 2 Query objects. [[nodiscard]] bool operator==(const Query&) const; @@ -970,8 +976,10 @@ class Query { using OnHelperR = OnHelperTempl; void checkSetObjectValue(const Variant& value) const; + virtual void deserializeJoinOn(Serializer& ser); void deserialize(Serializer& ser, bool& hasJoinConditions); - VariantArray deserializeValues(Serializer&, CondType); + VariantArray deserializeValues(Serializer&, CondType) const; + virtual void serializeJoinEntries(WrSerializer& ser) const; void checkSubQueryNoData() const; void checkSubQueryWithData() const; void checkSubQuery() const; @@ -996,7 +1004,7 @@ class Query { OpType nextOp_ = OpAnd; /// Next operation constant. 
}; -class JoinedQuery : public Query { +class JoinedQuery final : public Query { public: JoinedQuery(JoinType jt, const Query& q) : Query(q), joinType{jt} {} JoinedQuery(JoinType jt, Query&& q) : Query(std::move(q)), joinType{jt} {} @@ -1005,6 +1013,10 @@ class JoinedQuery : public Query { JoinType joinType{JoinType::LeftJoin}; /// Default join type. h_vector joinEntries_; /// Condition for join. Filled in each subqueries, empty in root query + +private: + void deserializeJoinOn(Serializer& ser); + void serializeJoinEntries(WrSerializer& ser) const; }; template diff --git a/cpp_src/gtests/tests/unit/ft/ft_generic.cc b/cpp_src/gtests/tests/unit/ft/ft_generic.cc index 6672b748c..6bd145c25 100644 --- a/cpp_src/gtests/tests/unit/ft/ft_generic.cc +++ b/cpp_src/gtests/tests/unit/ft/ft_generic.cc @@ -1893,7 +1893,7 @@ TEST_P(FTGenericApi, FrisoTest) { TEST_P(FTGenericApi, FrisoTestSelect) { reindexer::FtFastConfig cfg = GetDefaultConfig(); cfg.stopWords = {}; - cfg.splitterType = reindexer::FtFastConfig::Splitter::Friso; + cfg.splitterType = reindexer::FtFastConfig::Splitter::MMSegCN; Init(cfg); std::unordered_map> index; @@ -1948,7 +1948,7 @@ TEST_P(FTGenericApi, FrisoTestSelect) { TEST_P(FTGenericApi, FrisoTextPostprocess) { reindexer::FtFastConfig cfg = GetDefaultConfig(); - cfg.splitterType = reindexer::FtFastConfig::Splitter::Friso; + cfg.splitterType = reindexer::FtFastConfig::Splitter::MMSegCN; cfg.stopWords = {}; cfg.maxAreasInDoc = 10; Init(cfg); diff --git a/cpp_src/replicator/replicator.cc b/cpp_src/replicator/replicator.cc index e7f478b1e..905cde0bb 100644 --- a/cpp_src/replicator/replicator.cc +++ b/cpp_src/replicator/replicator.cc @@ -609,7 +609,7 @@ Error Replicator::syncNamespaceForced(const NamespaceDef& ns, std::string_view r err = syncMetaForced(tmpNs, ns.name); if (err.ok()) { - err = syncSchemaForced(tmpNs, ns.name); + err = syncSchemaForced(tmpNs, NamespaceDef(ns.name)); } // Make query to complete master's namespace data diff --git a/cpp_src/server/contrib/server.md b/cpp_src/server/contrib/server.md index 28d7951b8..a89768979 100644 --- a/cpp_src/server/contrib/server.md +++ b/cpp_src/server/contrib/server.md @@ -2364,7 +2364,7 @@ Fulltext Index configuration |**partial_match_decrease**
*optional*|Decrease of relevancy in case of partial match by value: partial_match_decrease * (non matched symbols) / (matched symbols)<br>**Minimum value** : `0`<br>**Maximum value** : `100`|integer|
 |**position_boost**<br>*optional*|Boost of search query term position<br>**Default** : `1.0`<br>**Minimum value** : `0`<br>**Maximum value** : `10`|number (float)|
 |**position_weight**<br>*optional*|Weight of search query term position in final rank. 0: term position will not change final rank. 1: term position will affect to final rank in 0 - 100% range<br>**Default** : `0.1`<br>**Minimum value** : `0`<br>**Maximum value** : `1`|number (float)|
-|**splitter**<br>*optional*|Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'friso' - algorithm based on mmseg for Chinese and English<br>**Default** : `"fast"`|enum (fast, friso)|
+|**splitter**<br>*optional*|Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'mmseg_cn' - algorithm based on friso implementation of mmseg for Chinese and English"<br>**Default** : `"fast"`|enum (fast, mmseg_cn)|
 |**stemmers**<br>*optional*|List of stemmers to use|< string > array|
 |**stop_words**<br>*optional*|List of objects of stop words. Words from this list will be ignored when building indexes|< [FtStopWordObject](#ftstopwordobject) > array|
 |**sum_ranks_by_fields_ratio**<br>*optional*|Ratio to summation of ranks of match one term in several fields. For example, if value of this ratio is K, request is '@+f1,+f2,+f3 word', ranks of match in fields are R1, R2, R3 and R2 < R1 < R3, final rank will be R = R2 + K*R1 + K*K*R3<br>**Default** : `0.0`<br>**Minimum value** : `0`
**Maximum value** : `1`|number (float)| diff --git a/cpp_src/server/contrib/server.yml b/cpp_src/server/contrib/server.yml index 7e04c6f7a..a2e82cf6b 100644 --- a/cpp_src/server/contrib/server.yml +++ b/cpp_src/server/contrib/server.yml @@ -3124,10 +3124,10 @@ definitions: maximum: 500 splitter: type: string - description: "Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'friso' - algorithm based on mmseg for Chinese and English" + description: "Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'mmseg_cn' - algorithm based on friso implementation of mmseg for Chinese and English" enum: - "fast" - - "friso" + - "mmseg_cn" default: "fast" FulltextFieldConfig: diff --git a/ftfastconfig.go b/ftfastconfig.go index f2e3ab9be..e97a0f800 100644 --- a/ftfastconfig.go +++ b/ftfastconfig.go @@ -191,9 +191,9 @@ type FtFastConfig struct { // Config for document ranking Bm25Config *Bm25ConfigType `json:"bm25_config,omitempty"` // Text tokenization algorithm. Default 'fast'. - // 'fast' : splits text by spaces, special characters and unsupported UTF-8 symbols. - // Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. - // 'friso': algorithm based on mmseg for Chinese and English + // 'fast' : splits text by spaces, special characters and unsupported UTF-8 symbols. + // Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. + // 'mmseg_cn': algorithm based on friso mmseg for Chinese and English SplitterType string `json:"splitter,omitempty"` } diff --git a/fulltext.md b/fulltext.md index c5258e823..d8065fd97 100644 --- a/fulltext.md +++ b/fulltext.md @@ -385,7 +385,7 @@ Several parameters of full text search engine can be configured from application | | Optimization | string | Optimize the index by 'memory' or by 'cpu' | "memory" | | | FtBaseRanking | struct | Relevance of the word in different forms | | | | Bm25Config | struct | Document ranking function parameters [More...](#basic-document-ranking-algorithms) | | -| | SplitterType | string | Text breakdown algorithm. Available values: 'friso' and 'fast' | "fast" | +| | SplitterType | string | Text breakdown algorithm. Available values: 'mmseg_cn' and 'fast' | "fast" | ### Stopwords details The list item can be either a string or a structure containing a string (the stopword) and a bool attribute (`is_morpheme`) indicating whether the stopword can be part of a word that can be shown in query-results. @@ -451,7 +451,7 @@ FtBaseRanking: config for the base relevancy of the word in different forms. ### Text splitters -Reindexer supports two algorithms to break texts into words: `fast` and `friso`. +Reindexer supports two algorithms to break texts into words: `fast` and `mmseg_cn`. Default `fast` algorithm is based on the definition of a word in the form of a alpha (from supported Unicode subset), number and an extended character, everything else (whitespaces, special characters, unsopported Unicode subsets, etc) will be threated as a delimiters. 
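Editor's note on the splitter rename above: per the `ftfastconfig.cc` hunk earlier in this diff, the parser now maps both the new `mmseg_cn` value and the legacy `friso` value to the same MMSeg-based splitter, while `GetJson()` always writes back `mmseg_cn`. Below is a minimal, self-contained C++ sketch of that name mapping; the names mirror `FtFastConfig::parse`, but this is an illustration, not the actual reindexer code.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Mirrors the splitter-name handling added in FtFastConfig::parse():
// "fast" selects the whitespace/symbol splitter, and both "mmseg_cn"
// (new name) and "friso" (legacy alias) select the MMSeg-based splitter.
enum class Splitter { Fast, MMSegCN };

Splitter splitterFromString(const std::string& s) {
	if (s == "fast") {
		return Splitter::Fast;
	}
	if (s == "friso" || s == "mmseg_cn") {
		return Splitter::MMSegCN;  // old configs with "friso" keep working
	}
	throw std::runtime_error("unknown splitter value: " + s);
}

int main() {
	// Both spellings resolve to the same splitter type.
	std::cout << (splitterFromString("friso") == splitterFromString("mmseg_cn")) << '\n';  // prints 1
}
```

Note that the mapping is one-way on output: a config loaded with `"splitter": "friso"` is re-serialized by `GetJson()` as `"mmseg_cn"`.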
@@ -487,7 +487,7 @@ Reindexer supports the following unicode block codes / extra symbols: This algorithm is simple and provides high performance, but it can not handle texts without delimiters (for example, Chinese language does not requires whitespaces between words, so `fast`-splitter will not be able to index it properly). -Alternative `friso`-splitter is based on [friso](https://github.com/lionsoul2014/friso) implementation of `mmseg` algorithm and uses dictionaries for tokenization. Currently this splitter supports only Chinese and English languages. +Alternative `mmseg_cn`-splitter is based on [friso](https://github.com/lionsoul2014/friso) implementation of `mmseg` algorithm and uses dictionaries for tokenization. Currently this splitter supports only Chinese and English languages. ### Basic document ranking algorithms diff --git a/test/compatibility_test/compatibility_test.sh b/test/compatibility_test/compatibility_test.sh index 23d5edbe2..4cf7fd816 100755 --- a/test/compatibility_test/compatibility_test.sh +++ b/test/compatibility_test/compatibility_test.sh @@ -39,19 +39,19 @@ test_outdated_instance() { echo "====Master: ${master_cmd}" echo "====Slave: ${slave_cmd}" init_storages - ${master_cmd} --db "${master_db_path}" -l0 --serverlog=\"reindexer_master_$3.1.log\" --corelog=\"reindexer_master_$3.1.log\" --httplog=\"\" --rpclog=\"\" & + ${master_cmd} --db "${master_db_path}" -l trace --serverlog=reindexer_master_$3.1.log --corelog=reindexer_master_$3.1.log --httplog=none --rpclog=none & master_pid=$! sleep 8 go run ${script_dir}/filler.go --dsn "${master_dsn}/${db_name}" --offset 0 echo "====Force sync" - ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l0 --serverlog=\"reindexer_slave_$3.1.log\" --corelog=\"reindexer_slave_$3.1.log\" --httplog=\"\" --rpclog=\"\" & + ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l trace --serverlog=reindexer_slave_$3.1.log --corelog=reindexer_slave_$3.1.log --httplog=none --rpclog=none & slave_pid=$! sleep 8 kill $slave_pid - wait $slave_pid + wait $slave_pid go run ${script_dir}/filler.go --dsn "${master_dsn}/${db_name}" --offset 100 echo "====Sync by WAL" - ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l0 --serverlog=\"reindexer_slave_$3.2.log\" --corelog=\"reindexer_slave_$3.2.log\" --httplog=\"\" --rpclog=\"\" & + ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l trace --serverlog=reindexer_slave_$3.2.log --corelog=reindexer_slave_$3.2.log --httplog=none --rpclog=none & slave_pid=$! sleep 12 echo "====Online sync" @@ -60,11 +60,10 @@ test_outdated_instance() { build/cpp_src/cmd/reindexer_tool/reindexer_tool --dsn "${master_dsn}/${db_name}" --command "\dump ${ns_name}" --output "${master_dump}" build/cpp_src/cmd/reindexer_tool/reindexer_tool --dsn "${slave_dsn}/${db_name}" --command "\dump ${ns_name}" --output "${slave_dump}" kill $slave_pid - wait $slave_pid - kill $master_pid - wait $master_pid - sed -i -E "s/(\\NAMESPACES ADD.*)(\"schema\":\"\{.*\}\")/\1\"schema\":\"\{\}\"/" "${master_dump}" - ${script_dir}/compare_dumps.sh "${master_dump}" "${slave_dump}" + wait $slave_pid + kill $master_pid + wait $master_pid + ${script_dir}/compare_dumps.sh "${master_dump}" "${slave_dump}" } echo "====Installing reindexer package===="
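Editor's note on the `query.cc`/`query.h` hunks earlier in this diff: join-condition (de)serialization, previously done via `reinterpret_cast<JoinedQuery*>(this)` inside `Query`, is now routed through virtual hooks that only `JoinedQuery` overrides, with the base implementations throwing. The following is a simplified, self-contained sketch of that dispatch pattern using hypothetical stand-in types, not the real reindexer classes.

```cpp
#include <iostream>
#include <stdexcept>

// Simplified stand-in for reindexer's WrSerializer.
struct WrSerializer {};

class Query {
public:
	virtual ~Query() = default;

	void Serialize(WrSerializer& ser) const {
		// ...other query fields would be written here...
		serializeJoinEntries(ser);  // virtual dispatch replaces reinterpret_cast<const JoinedQuery*>(this)
	}

protected:
	// Only JoinedQuery carries join entries, so the base implementation throws,
	// mirroring the new Query::serializeJoinEntries().
	virtual void serializeJoinEntries(WrSerializer&) const {
		throw std::logic_error("JoinEntries actual only for JoinQuery");
	}
};

class JoinedQuery final : public Query {
private:
	void serializeJoinEntries(WrSerializer&) const override {
		// would emit QueryJoinOn, operation, condition and field names here
	}
};

int main() {
	WrSerializer ser;
	JoinedQuery jq;
	jq.Serialize(ser);  // fine: the overridden hook runs
	try {
		Query q;
		q.Serialize(ser);  // base hook throws, mirroring the new error path
	} catch (const std::logic_error& e) {
		std::cout << e.what() << '\n';
	}
}
```

In this sketch, as in the patch, marking `JoinedQuery` as `final` and keeping the hooks non-public confines join-entry handling to the one type that owns `joinEntries_`.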