A library for tokenizing UTF-8 text using regular expressions.
- CMake -- Build, test and package software
- PCRE2 -- Perl Compatible Regular Expressions
- spdlog -- Super fast C++ logging library
- ICU -- Library for Unicode and Globalization
- Jansson -- C library for working with JSON
- APR -- Apache Portable Runtime
- APR-util -- Apache Portable Runtime Utility
mkdir ./build
cd ./build
cmake ..
make [-j]
cd ./scripts
./build.dic.sh
# check dictionary files
ls ../db
# expected output: phrase.db token.sgmt.db
cd ./scripts
./db.import.sh
cd ./tests
# test script for the text tokenizer and for extracting phrase patterns from text
./test.phrase.sh
# test script for text tokenizer
./test.token.sh
cd ./scripts
./db.free.sh
Licensed under the Apache-2.0 license.