From bfe3fc0637a0daaaf6fb4d6da6658e1f925c4552 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 7 Feb 2016 12:07:27 -0600 Subject: [PATCH 01/48] #130 make ZLIB a required dependency --- CMakeLists.txt | 6 ++--- include/meta/corpus/all.h | 4 +-- include/meta/parser/sr_parser.h | 6 ----- include/meta/sequence/sequence_analyzer.h | 6 ----- src/corpus/CMakeLists.txt | 23 +++++++--------- src/corpus/corpus_factory.cpp | 2 -- src/io/CMakeLists.txt | 16 ++++-------- src/io/tools/CMakeLists.txt | 6 ++--- src/parser/sr_parser.cpp | 32 ++++------------------- src/parser/transition_map.cpp | 21 ++------------- src/sequence/perceptron.cpp | 21 ++------------- src/sequence/sequence_analyzer.cpp | 20 +------------- tests/forward_index_test.cpp | 2 -- tests/inverted_index_test.cpp | 2 -- 14 files changed, 30 insertions(+), 137 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c9262053f..ba2d99755 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,9 +119,9 @@ if(STDOPT) endif() if(ZLIB_FOUND) - target_include_directories(meta-definitions SYSTEM INTERFACE - ${ZLIB_INCLUDE_DIRS}) - target_compile_definitions(meta-definitions INTERFACE -DMETA_HAS_ZLIB) + target_include_directories(meta-definitions SYSTEM INTERFACE ${ZLIB_INCLUDE_DIRS}) +else() + message(FATAL_ERROR "Failed to find required dependency ZLIB") endif() if(LIBDL_LIBRARY) diff --git a/include/meta/corpus/all.h b/include/meta/corpus/all.h index 701868f71..a8839b7b0 100644 --- a/include/meta/corpus/all.h +++ b/include/meta/corpus/all.h @@ -1,7 +1,5 @@ #include "meta/corpus/corpus.h" -#if META_HAS_ZLIB -#include "meta/corpus/gz_corpus.h" -#endif #include "meta/corpus/file_corpus.h" +#include "meta/corpus/gz_corpus.h" #include "meta/corpus/libsvm_corpus.h" #include "meta/corpus/line_corpus.h" diff --git a/include/meta/parser/sr_parser.h b/include/meta/parser/sr_parser.h index 111bcc7c1..f2e5154ba 100644 --- a/include/meta/parser/sr_parser.h +++ b/include/meta/parser/sr_parser.h @@ -259,12 +259,6 @@ class sr_parser best_transitions(const feature_vector& features, const state& state, size_t num, bool check_legality = false) const; - /** - * Loads the parser model file from the given stream. - * @param model The input stream to read from - */ - void load(std::istream& model); - /** * Storage for the ids for each transition */ diff --git a/include/meta/sequence/sequence_analyzer.h b/include/meta/sequence/sequence_analyzer.h index 629742f58..3d93f902e 100644 --- a/include/meta/sequence/sequence_analyzer.h +++ b/include/meta/sequence/sequence_analyzer.h @@ -318,12 +318,6 @@ class sequence_analyzer */ void load_feature_id_mapping(const std::string& prefix); - /** - * Loads the feature_id mapping from disk using an input stream. - * @param input The input stream of the feature_id mapping - */ - void load_feature_id_mapping(std::istream& input); - /** * Loads the label_id mapping from disk. 
* @param prefix The folder to load the mapping from diff --git a/src/corpus/CMakeLists.txt b/src/corpus/CMakeLists.txt index 664f8c7f4..0d43eca45 100644 --- a/src/corpus/CMakeLists.txt +++ b/src/corpus/CMakeLists.txt @@ -2,19 +2,14 @@ project(meta-corpus) add_subdirectory(tools) -set(CORPUS_SOURCES corpus.cpp - corpus_factory.cpp - document.cpp - file_corpus.cpp - libsvm_corpus.cpp - line_corpus.cpp - metadata.cpp - metadata_parser.cpp) - -if (ZLIB_FOUND) - list(APPEND CORPUS_SOURCES gz_corpus.cpp) -endif() - -add_library(meta-corpus ${CORPUS_SOURCES}) +add_library(meta-corpus corpus.cpp + corpus_factory.cpp + document.cpp + file_corpus.cpp + libsvm_corpus.cpp + line_corpus.cpp + gz_corpus.cpp + metadata.cpp + metadata_parser.cpp) target_link_libraries(meta-corpus meta-io meta-utf cpptoml) diff --git a/src/corpus/corpus_factory.cpp b/src/corpus/corpus_factory.cpp index c134633c7..3de3a984a 100644 --- a/src/corpus/corpus_factory.cpp +++ b/src/corpus/corpus_factory.cpp @@ -22,9 +22,7 @@ corpus_factory::corpus_factory() // built-in corpora reg(); reg(); -#if META_HAS_ZLIB reg(); -#endif reg(); } diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt index 2c1721a4d..c4619330f 100644 --- a/src/io/CMakeLists.txt +++ b/src/io/CMakeLists.txt @@ -2,21 +2,15 @@ project(meta-io) add_subdirectory(tools) -set(IO_SOURCES filesystem.cpp - libsvm_parser.cpp - mmap_file.cpp) - -set(IO_DEPS meta-util) - -if (ZLIB_FOUND) - list(APPEND IO_SOURCES gzstream.cpp) - list(APPEND IO_DEPS ${ZLIB_LIBRARIES}) -endif() +set(IO_DEPS meta-util ${ZLIB_LIBRARIES}) if (WIN32) add_subdirectory(mman-win32) list(APPEND IO_DEPS mman-win32) endif() -add_library(meta-io ${IO_SOURCES}) +add_library(meta-io filesystem.cpp + gzstream.cpp + libsvm_parser.cpp + mmap_file.cpp) target_link_libraries(meta-io ${IO_DEPS}) diff --git a/src/io/tools/CMakeLists.txt b/src/io/tools/CMakeLists.txt index 535f6d2d1..d317cbee1 100644 --- a/src/io/tools/CMakeLists.txt +++ b/src/io/tools/CMakeLists.txt @@ -1,4 +1,2 @@ -if (ZLIB_FOUND) - add_executable(compressor-test compressor_test.cpp) - target_link_libraries(compressor-test meta-io) -endif() +add_executable(compressor-test compressor_test.cpp) +target_link_libraries(compressor-test meta-io) diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index 5940bdbec..d783add33 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -7,6 +7,7 @@ #include #include "meta/io/filesystem.h" +#include "meta/io/gzstream.h" #include "meta/io/packed.h" #include "meta/logging/logger.h" #include "meta/parallel/parallel_for.h" @@ -22,10 +23,6 @@ #include "meta/util/range.h" #include "meta/util/time.h" -#ifdef META_HAS_ZLIB -#include "meta/io/gzstream.h" -#endif - namespace meta { namespace parser @@ -264,9 +261,9 @@ std::pair sr_parser::train_instance( } std::pair - sr_parser::train_early_termination(const parse_tree& tree, - const std::vector& transitions, - weight_vectors& update) const +sr_parser::train_early_termination(const parse_tree& tree, + const std::vector& transitions, + weight_vectors& update) const { std::pair result{0, 0}; state state{tree}; @@ -439,34 +436,15 @@ auto sr_parser::best_transitions(const feature_vector& features, void sr_parser::save(const std::string& prefix) const { trans_.save(prefix); - -#ifdef META_HAS_ZLIB io::gzofstream model{prefix + "/parser.model.gz"}; -#else - std::ofstream model{prefix + "/parser.model", std::ios::binary}; -#endif - io::packed::write(model, beam_size_); - model_.save(model); } void sr_parser::load(const std::string& prefix) { 
-#ifdef META_HAS_ZLIB - if (filesystem::file_exists(prefix + "/parser.model.gz")) - { - io::gzifstream model{prefix + "/parser.model.gz"}; - load(model); - return; - } -#endif - std::ifstream model{prefix + "/parser.model", std::ios::binary}; - load(model); -} + io::gzifstream model{prefix + "/parser.model.gz"}; -void sr_parser::load(std::istream& model) -{ if (!model) throw sr_parser_exception{"model file not found"}; diff --git a/src/parser/transition_map.cpp b/src/parser/transition_map.cpp index 0faaa35d6..774aed20a 100644 --- a/src/parser/transition_map.cpp +++ b/src/parser/transition_map.cpp @@ -4,16 +4,12 @@ */ #include -#include #include "meta/io/filesystem.h" +#include "meta/io/gzstream.h" #include "meta/io/packed.h" #include "meta/parser/transition_map.h" -#ifdef META_HAS_ZLIB -#include "meta/io/gzstream.h" -#endif - namespace meta { namespace parser @@ -21,15 +17,7 @@ namespace parser transition_map::transition_map(const std::string& prefix) { -#ifdef META_HAS_ZLIB - if (filesystem::file_exists(prefix + "/parser.trans.gz")) - { - io::gzifstream store{prefix + "/parser.trans.gz"}; - load(store); - return; - } -#endif - std::ifstream store{prefix + "/parser.trans", std::ios::binary}; + io::gzifstream store{prefix + "/parser.trans.gz"}; load(store); } @@ -114,12 +102,7 @@ uint64_t transition_map::size() const void transition_map::save(const std::string& prefix) const { -#ifdef META_HAS_ZLIB io::gzofstream store{prefix + "/parser.trans.gz"}; -#else - std::ofstream store{prefix + "/parser.trans", std::ios::binary}; -#endif - io::packed::write(store, transitions_.size()); for (const auto& trans : transitions_) { diff --git a/src/sequence/perceptron.cpp b/src/sequence/perceptron.cpp index 1a53333db..5a77e526f 100644 --- a/src/sequence/perceptron.cpp +++ b/src/sequence/perceptron.cpp @@ -6,15 +6,12 @@ #include #include "meta/io/filesystem.h" +#include "meta/io/gzstream.h" #include "meta/sequence/perceptron.h" #include "meta/utf/utf.h" #include "meta/util/progress.h" #include "meta/util/time.h" -#if META_HAS_ZLIB -#include "meta/io/gzstream.h" -#endif - namespace meta { namespace sequence @@ -50,16 +47,7 @@ perceptron::perceptron() : analyzer_{default_pos_analyzer()} perceptron::perceptron(const std::string& prefix) : perceptron() { analyzer_.load(prefix); - -#if META_HAS_ZLIB - if (filesystem::file_exists(prefix + "/tagger.model.gz")) - { - io::gzifstream file{prefix + "/tagger.model.gz"}; - model_.load(file); - return; - } -#endif - std::ifstream file{prefix + "/tagger.model"}; + io::gzifstream file{prefix + "/tagger.model.gz"}; model_.load(file); } @@ -150,12 +138,7 @@ void perceptron::train(std::vector& sequences, void perceptron::save(const std::string& prefix) const { analyzer_.save(prefix); - -#if META_HAS_ZLIB io::gzofstream file{prefix + "/tagger.model.gz"}; -#else - std::ofstream file{prefix + "/tagger.model"}; -#endif model_.save(file); } } diff --git a/src/sequence/sequence_analyzer.cpp b/src/sequence/sequence_analyzer.cpp index 8b4ff840c..9d02f2aa6 100644 --- a/src/sequence/sequence_analyzer.cpp +++ b/src/sequence/sequence_analyzer.cpp @@ -8,9 +8,7 @@ #include #include "meta/io/packed.h" #include "meta/io/filesystem.h" -#if META_HAS_ZLIB #include "meta/io/gzstream.h" -#endif #include "meta/sequence/sequence_analyzer.h" #include "meta/utf/utf.h" #include "meta/util/mapping.h" @@ -36,20 +34,8 @@ void sequence_analyzer::load(const std::string& prefix) void sequence_analyzer::load_feature_id_mapping(const std::string& prefix) { -#if META_HAS_ZLIB - if 
(filesystem::file_exists(prefix + "/feature.mapping.gz")) - { - io::gzifstream input{prefix + "/feature.mapping.gz"}; - load_feature_id_mapping(input); - return; - } -#endif - std::ifstream input{prefix + "/feature.mapping", std::ios::binary}; - load_feature_id_mapping(input); -} + io::gzifstream input{prefix + "/feature.mapping.gz"}; -void sequence_analyzer::load_feature_id_mapping(std::istream& input) -{ if (!input) throw exception{"missing feature id mapping"}; @@ -81,11 +67,7 @@ void sequence_analyzer::save(const std::string& prefix) const printing::progress progress{" > Saving feature mapping: ", feature_id_mapping_.size()}; -#if META_HAS_ZLIB io::gzofstream output{prefix + "/feature.mapping.gz"}; -#else - std::ofstream output{prefix + "/feature.mapping", std::ios::binary}; -#endif io::packed::write(output, feature_id_mapping_.size()); uint64_t i = 0; for (const auto& pair : feature_id_mapping_) diff --git a/tests/forward_index_test.cpp b/tests/forward_index_test.cpp index 96c996145..2a20fc8cf 100644 --- a/tests/forward_index_test.cpp +++ b/tests/forward_index_test.cpp @@ -168,7 +168,6 @@ go_bandit([]() { it("should load the index", [&]() { bcancer_forward_test(*svm_cfg); }); }); -#if META_HAS_ZLIB describe("[forward-index] with zlib", []() { filesystem::remove_all("ceeaus-fwd"); @@ -183,7 +182,6 @@ go_bandit([]() { it("should load the index", [&]() { ceeaus_forward_test(*gz_cfg); }); }); -#endif filesystem::remove_all("ceeaus-inv"); filesystem::remove_all("ceeaus-fwd"); diff --git a/tests/inverted_index_test.cpp b/tests/inverted_index_test.cpp index 4181757c1..483a73f86 100644 --- a/tests/inverted_index_test.cpp +++ b/tests/inverted_index_test.cpp @@ -123,7 +123,6 @@ go_bandit([]() { }); }); -#if META_HAS_ZLIB describe("[inverted-index] with zlib", []() { filesystem::remove_all("ceeaus-inv"); @@ -141,7 +140,6 @@ go_bandit([]() { }); }); -#endif filesystem::remove_all("ceeaus-inv"); }); From a710f309e054947ab805c9119251f30bc8c8f86b Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Sun, 7 Feb 2016 12:15:13 -0600 Subject: [PATCH 02/48] better error handling and messages for sr_parser and sequence_analyzer model files --- src/parser/sr_parser.cpp | 8 ++++---- src/sequence/sequence_analyzer.cpp | 14 ++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/parser/sr_parser.cpp b/src/parser/sr_parser.cpp index d783add33..d022f1b5e 100644 --- a/src/parser/sr_parser.cpp +++ b/src/parser/sr_parser.cpp @@ -443,11 +443,11 @@ void sr_parser::save(const std::string& prefix) const void sr_parser::load(const std::string& prefix) { - io::gzifstream model{prefix + "/parser.model.gz"}; - - if (!model) - throw sr_parser_exception{"model file not found"}; + auto model_file = prefix + "/parser.model.gz"; + if (!filesystem::file_exists(model_file)) + throw sr_parser_exception{"model file not found: " + model_file}; + io::gzifstream model{model_file}; io::packed::read(model, beam_size_); model_.load(model); } diff --git a/src/sequence/sequence_analyzer.cpp b/src/sequence/sequence_analyzer.cpp index 9d02f2aa6..b93f70ec7 100644 --- a/src/sequence/sequence_analyzer.cpp +++ b/src/sequence/sequence_analyzer.cpp @@ -34,10 +34,11 @@ void sequence_analyzer::load(const std::string& prefix) void sequence_analyzer::load_feature_id_mapping(const std::string& prefix) { - io::gzifstream input{prefix + "/feature.mapping.gz"}; + auto feature_file = prefix + "/feature.mapping.gz"; + if (!filesystem::file_exists(feature_file)) + throw exception{"missing feature id mapping: " + feature_file}; - if 
(!input)
-        throw exception{"missing feature id mapping"};
+    io::gzifstream input{feature_file};
 
     uint64_t total_num_keys;
     io::packed::read(input, total_num_keys);
@@ -56,10 +57,11 @@ void sequence_analyzer::load_feature_id_mapping(const std::string& prefix)
 
 void sequence_analyzer::load_label_id_mapping(const std::string& prefix)
 {
-    if (!filesystem::file_exists(prefix + "/label.mapping"))
-        throw exception{"missing label mapping"};
+    auto label_file = prefix + "/label.mapping";
+    if (!filesystem::file_exists(label_file))
+        throw exception{"missing label mapping: " + label_file};
 
-    map::load_mapping(label_id_mapping_, prefix + "/label.mapping");
+    map::load_mapping(label_id_mapping_, label_file);
 }
 
 void sequence_analyzer::save(const std::string& prefix) const

From c777e84f5a4b7ef1011fd83fccc302b8bf18c6c1 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Sun, 7 Feb 2016 12:42:44 -0600
Subject: [PATCH 03/48] add zlib dependency to setup guide

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 18b75dff9..782e5a9f6 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ sudo add-apt-repository ppa:ubuntu-toolchain-r/test
 sudo apt-get update
 
 # this will probably take a while
-sudo apt-get install g++ g++-4.8 git make wget libjemalloc-dev
+sudo apt-get install g++ g++-4.8 git make wget libjemalloc-dev zlib1g-dev
 
 wget http://www.cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.sh
 sudo sh cmake-3.2.0-Linux-x86_64.sh --prefix=/usr/local
@@ -216,7 +216,7 @@ sudo add-apt-repository ppa:george-edison55/cmake-3.x
 sudo apt-get update
 
 # install dependencies
-sudo apt-get install cmake libicu-dev git libjemalloc-dev
+sudo apt-get install cmake libicu-dev git libjemalloc-dev zlib1g-dev
 ```
 
 Once the dependencies are all installed, you should double check your
@@ -283,7 +283,7 @@ To install the dependencies, run the following commands.
 
 ```bash
 sudo pacman -Sy
-sudo pacman -S clang cmake git icu libc++ make jemalloc
+sudo pacman -S clang cmake git icu libc++ make jemalloc zlib
 ```
 
 Once the dependencies are all installed, you should be ready to build. Run
@@ -470,7 +470,7 @@ you should run the following commands to download dependencies and related
 software needed for building:
 
 ```bash
-pacman -Syu git make mingw-w64-x86_64-{gcc,cmake,icu,jemalloc}
+pacman -Syu git make mingw-w64-x86_64-{gcc,cmake,icu,jemalloc,zlib}
 ```
 
 Then, exit the shell and launch the "MinGW-w64 Win64" shell. You can obtain

From d92b3bd2ea8ba7a8bbbba073038d5653a28deb67 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 8 Feb 2016 13:00:51 -0600
Subject: [PATCH 04/48] Simplify zlib detection in root CMakeLists.txt.
---
 CMakeLists.txt | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ba2d99755..040f92e37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,10 +19,10 @@ include(CheckCXXSourceCompiles)
 include(CheckCXXSourceRuns)
 include(CMakePushCheckState)
 include(ExternalProject)
-include(FindZLIB)
 include(cmake/FindOrBuildICU.cmake)
 
 find_package(Threads REQUIRED)
+find_package(ZLIB REQUIRED)
 
 cmake_push_check_state()
 
@@ -118,11 +118,7 @@ if(STDOPT)
     target_compile_options(meta-definitions INTERFACE ${STDOPT})
 endif()
 
-if(ZLIB_FOUND)
-    target_include_directories(meta-definitions SYSTEM INTERFACE ${ZLIB_INCLUDE_DIRS})
-else()
-    message(FATAL_ERROR "Failed to find required dependency ZLIB")
-endif()
+target_include_directories(meta-definitions SYSTEM INTERFACE ${ZLIB_INCLUDE_DIRS})
 
 if(LIBDL_LIBRARY)
     target_link_libraries(meta-definitions INTERFACE ${LIBDL_LIBRARY})

From b2dc9b46a26fd2d22a8f3e5261c574020ddaab53 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 8 Feb 2016 13:01:33 -0600
Subject: [PATCH 05/48] Explicitly install zlib in Appveyor MSYS2 build.

This isn't required, but we might as well explicitly install all of our
needed dependencies in case Appveyor changes the default packages
installed in MSYS2.
---
 .appveyor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index bf45a5fbe..c286bef9e 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -9,7 +9,7 @@ install:
   - bash -lc "pacman --noconfirm --needed -Sy bash pacman pacman-mirrors msys2-runtime msys2-runtime-devel"
   # we don't actually need ada, fortran, libgfortran, or objc, but in
   # order to update gcc we need to also update those packages as well...
-  - bash -lc "pacman --noconfirm -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc}"
+  - bash -lc "pacman --noconfirm -S mingw-w64-x86_64-{gcc,gcc-ada,gcc-fortran,gcc-libgfortran,gcc-objc,cmake,make,icu,jemalloc,zlib}"
 before_build:
   - cd C:\projects\meta
   - git submodule update --init --recursive

From 22df263ffe435d0a03eb0e574006b511aeb9a7be Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Mon, 8 Feb 2016 22:07:01 -0600
Subject: [PATCH 06/48] Improve performance of printing::progress.

Before, progress::operator() in tight loops could dramatically hurt
performance, particularly due to frequent calls to
std::chrono::steady_clock::now() in that function to determine that
progress did not need to be reported. This was noticed while writing the
shuffle phase for the GloVe trainer, where adding progress reporting
slowed the first-pass shuffle phase by nearly 3x.

Instead, progress::operator() now simply sets an atomic iteration
counter. A background thread will periodically wake (in interval_
millisecond windows) and update the progress output. This pushes the
responsibility of delaying progress output to the background thread
rather than the main thread. The only overhead now is setting that
atomic iteration count, which should be significantly lower than before.

This removes an argument from printing::progress's constructor since it
is not really needed anymore.
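A minimal sketch of the pattern this adopts (illustrative only; the class
name cheap_progress and its members are hypothetical simplifications, not
the exact MeTA implementation that follows in the diff):

    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>
    #include <thread>

    class cheap_progress
    {
      public:
        cheap_progress(uint64_t length, int interval_ms)
            : length_{length}, interval_{interval_ms}
        {
            // all other members are initialized before the thread starts
            thread_ = std::thread{[this] { report(); }};
        }

        // hot path: a single atomic store, no calls to steady_clock::now()
        void operator()(uint64_t iter)
        {
            iter_ = iter;
        }

        ~cheap_progress()
        {
            iter_ = length_;
            cond_var_.notify_all();
            thread_.join();
        }

      private:
        void report()
        {
            while (iter_ != length_)
            {
                // ... format and print iter_ / length_ here ...
                std::unique_lock<std::mutex> lock{mutex_};
                cond_var_.wait_for(lock,
                                   std::chrono::milliseconds(interval_));
            }
        }

        const uint64_t length_;
        const int interval_;
        std::atomic<uint64_t> iter_{0};
        std::mutex mutex_;
        std::condition_variable cond_var_;
        std::thread thread_;
    };

The main thread's operator() is now a single atomic store; the background
thread owns the clock reads and the output, waking at most once per
interval_ milliseconds.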
--- include/meta/util/progress.h | 49 ++++++------ src/index/forward_index.cpp | 4 +- src/index/inverted_index.cpp | 12 +-- src/io/filesystem.cpp | 22 +++--- src/topics/lda_scvb.cpp | 24 +++--- src/util/CMakeLists.txt | 2 +- src/util/progress.cpp | 141 ++++++++++++++++------------------- 7 files changed, 118 insertions(+), 136 deletions(-) diff --git a/include/meta/util/progress.h b/include/meta/util/progress.h index 3671de7e0..201871b2b 100644 --- a/include/meta/util/progress.h +++ b/include/meta/util/progress.h @@ -10,7 +10,11 @@ #ifndef META_UTIL_PROGRESS_H_ #define META_UTIL_PROGRESS_H_ +#include #include +#include +#include +#include #include namespace meta @@ -36,11 +40,8 @@ class progress * @param length The number of iterations * @param interval The length of time, in milliseconds, to wait * between updates. Default = 500ms. - * @param min_iters The minimum number of iterations that must pass - * before progress reporting will be considered */ - progress(const std::string& prefix, uint64_t length, - int interval = 500, uint64_t min_iters = 10); + progress(const std::string& prefix, uint64_t length, int interval = 500); /** * Sets whether or not an endline should be printed at completion. @@ -56,7 +57,9 @@ class progress ~progress(); /** - * Updates the progress indicator. + * Updates the progress indicator. Since progress is printed + * asynchronously, you may not immediately see results after calling + * this function, but they will be reflected in the next update tick. * @param iter The current iteration number to update to */ void operator()(uint64_t iter); @@ -72,27 +75,27 @@ class progress void clear() const; private: - /// The prefix for the progress report message. - std::string prefix_; + void print(); + void progress_thread(); + + /// The background thread for printing progress updates + std::thread thread_; + /// The mutex for the condition variable. + std::mutex mutex_; + /// The condition variable used by the background thread for sleeping + std::condition_variable cond_var_; + /// The output line + std::string output_; + /// The length of the prefix + const std::size_t prefix_len_; /// The start time of the job. - std::chrono::steady_clock::time_point start_; - /// The time of the last update. - std::chrono::steady_clock::time_point last_update_; - /// The last iteration number. - uint64_t last_iter_; + const std::chrono::steady_clock::time_point start_; + /// The current iteration number. + std::atomic iter_; /// The total number of iterations. - uint64_t length_; + const uint64_t length_; /// The length of time, in milliseconds, to wait between updates. - int interval_; - /** - * The minimum number of iterations that must pass before progress - * reporting will be considered. - */ - uint64_t min_iters_; - /// The length of the last progress output message. - int str_len_; - /// Whether or not we have finished the job. - bool finished_; + const int interval_; /// Whether or not we should print an endline when done. 
bool endline_; }; diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp index 556b22d32..c341fe890 100644 --- a/src/index/forward_index.cpp +++ b/src/index/forward_index.cpp @@ -604,9 +604,7 @@ void forward_index::impl::compress(const std::string& filename, std::ifstream in{ucfilename, std::ios::binary}; uint64_t byte_pos = 0; - printing::progress progress{ - " > Compressing postings: ", length, 500, 1024 /* 1KB */ - }; + printing::progress progress{" > Compressing postings: ", length}; // note: we will be accessing pdata in sorted order, but not every // doc_id is guaranteed to exist, so we must be mindful of document // gaps diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp index 5dd34ec5c..f47478570 100644 --- a/src/index/inverted_index.cpp +++ b/src/index/inverted_index.cpp @@ -79,9 +79,7 @@ class inverted_index::impl }; inverted_index::impl::impl(inverted_index* idx, const cpptoml::table& config) - : idx_{idx}, - analyzer_{analyzers::load(config)}, - total_corpus_terms_{0} + : idx_{idx}, analyzer_{analyzers::load(config)}, total_corpus_terms_{0} { // nothing } @@ -260,9 +258,7 @@ void inverted_index::impl::compress(const std::string& filename, std::ifstream in{ucfilename, std::ios::binary}; uint64_t byte_pos = 0; - printing::progress progress{ - " > Compressing postings: ", length, 500, 1024 /* 1KB */ - }; + printing::progress progress{" > Compressing postings: ", length}; // note: we will be accessing pdata in sorted order while (auto bytes = pdata.read_packed(in)) { @@ -319,7 +315,7 @@ float inverted_index::avg_doc_length() } analyzers::feature_map - inverted_index::tokenize(const corpus::document& doc) +inverted_index::tokenize(const corpus::document& doc) { return inv_impl_->analyzer_->analyze(doc); } @@ -336,7 +332,7 @@ auto inverted_index::search_primary(term_id t_id) const } util::optional> - inverted_index::stream_for(term_id t_id) const +inverted_index::stream_for(term_id t_id) const { return inv_impl_->postings_->find_stream(t_id); } diff --git a/src/io/filesystem.cpp b/src/io/filesystem.cpp index 67d93cec8..1a0f26e20 100644 --- a/src/io/filesystem.cpp +++ b/src/io/filesystem.cpp @@ -175,7 +175,6 @@ bool copy_file(const std::string& source, const std::string& dest) dest_file.write(buffer.data(), processed); prog(total_processed); } - prog.end(); } // otherwise, copy the file normally else @@ -201,20 +200,21 @@ uint64_t num_lines(const std::string& filename, char delimiter /*= '\n'*/) io::mmap_file file{filename}; uint64_t num = 0; - printing::progress progress{" > Counting lines in file: ", file.size(), 500, - 32 * 1024 * 1024}; - for (uint64_t idx = 0; idx < file.size(); ++idx) { - progress(idx); - if (file[idx] == delimiter) + printing::progress progress{" > Counting lines in file: ", file.size()}; + for (uint64_t idx = 0; idx < file.size(); ++idx) + { + progress(idx); + if (file[idx] == delimiter) + ++num; + } + + // this fixes a potential off-by-one if the last line in the file + // doesn't end with the delimiter + if (file[file.size() - 1] != delimiter) ++num; } - // this fixes a potential off-by-one if the last line in the file - // doesn't end with the delimiter - if (file[file.size() - 1] != delimiter) - ++num; - return num; } } diff --git a/src/topics/lda_scvb.cpp b/src/topics/lda_scvb.cpp index b0b317b2e..dc81bafd7 100644 --- a/src/topics/lda_scvb.cpp +++ b/src/topics/lda_scvb.cpp @@ -76,7 +76,7 @@ void lda_scvb::initialize(std::mt19937& rng) void lda_scvb::perform_iteration(uint64_t iter, const std::vector& docs) { 
printing::progress progress{"Minibatch " + std::to_string(iter) + ": ", - minibatch_size_, 100, 1}; + minibatch_size_}; std::vector> batch_topic_term_count_( num_topics_); @@ -104,9 +104,9 @@ void lda_scvb::perform_iteration(uint64_t iter, const std::vector& docs) gamma[k] /= sum; auto lr = 1.0 / std::pow(10 + t, 0.9); auto weight = std::pow(1 - lr, freq.second); - doc_topic_count_[d][k] = - weight * doc_topic_count_[d][k] - + (1 - weight) * idx_->doc_size(d) * gamma[k]; + doc_topic_count_[d][k] + = weight * doc_topic_count_[d][k] + + (1 - weight) * idx_->doc_size(d) * gamma[k]; } t += freq.second; } @@ -131,12 +131,12 @@ void lda_scvb::perform_iteration(uint64_t iter, const std::vector& docs) auto lr = 1.0 / std::pow(10 + t, 0.9); auto weight = std::pow(1 - lr, freq.second); - doc_topic_count_[d][k] = - weight * doc_topic_count_[d][k] - + (1 - weight) * idx_->doc_size(d) * gamma[k]; + doc_topic_count_[d][k] + = weight * doc_topic_count_[d][k] + + (1 - weight) * idx_->doc_size(d) * gamma[k]; - batch_topic_term_count_[k][freq.first] += - idx_->num_docs() * gamma[k]; + batch_topic_term_count_[k][freq.first] + += idx_->num_docs() * gamma[k]; batch_topic_count_[k] += idx_->num_docs() * gamma[k]; } @@ -157,9 +157,9 @@ void lda_scvb::perform_iteration(uint64_t iter, const std::vector& docs) { for (term_id i{0}; i < num_words_; ++i) { - topic_term_count_[k][i] = - (1 - lr) * topic_term_count_[k][i] - + lr * (batch_topic_term_count_[k][i] / minibatch_size_); + topic_term_count_[k][i] + = (1 - lr) * topic_term_count_[k][i] + + lr * (batch_topic_term_count_[k][i] / minibatch_size_); } topic_count_[k] = (1 - lr) * topic_count_[k] + lr * (batch_topic_count_[k] / minibatch_size_); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 84edefdbf..fff88d6f4 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -1,4 +1,4 @@ project(meta-util) add_library(meta-util progress.cpp) -target_link_libraries(meta-util meta-definitions) +target_link_libraries(meta-util meta-definitions ${CMAKE_THREAD_LIBS_INIT}) diff --git a/src/util/progress.cpp b/src/util/progress.cpp index aba483ddc..64935cdfc 100644 --- a/src/util/progress.cpp +++ b/src/util/progress.cpp @@ -3,6 +3,9 @@ * @author Chase Geigle */ +#include +#include +#include #include #include #include @@ -15,105 +18,88 @@ namespace meta namespace printing { -namespace -{ -std::string eta_str(int milliseconds) -{ - int hrs = milliseconds / 1000 / 60 / 60; - int ms = milliseconds % (1000 * 60 * 60); - - int min = ms / 1000 / 60; - ms = ms % (1000 * 60); - - int sec = ms / 1000; - - std::stringstream ss; - ss << "ETA " << std::setfill('0') << std::setw(2) << hrs << ':' - << std::setfill('0') << std::setw(2) << min << ':' << std::setfill('0') - << std::setw(2) << sec; - return ss.str(); -} -} - -progress::progress(const std::string& prefix, uint64_t length, int interval, - uint64_t min_iters) - : prefix_{prefix}, +progress::progress(const std::string& prefix, uint64_t length, int interval) + : prefix_len_{prefix.length()}, start_{std::chrono::steady_clock::now()}, - last_update_{start_}, - last_iter_{0}, + iter_{0}, length_{length}, interval_{interval}, - min_iters_{min_iters}, - str_len_{0}, - finished_{false}, endline_{true} { - // nothing -} + output_.resize(80, ' '); + assert(prefix_len_ < 80 - 20); + std::copy(prefix.begin(), prefix.end(), output_.begin()); + output_[prefix_len_] = '['; -void progress::print_endline(bool endline) -{ - endline_ = endline; + thread_ = std::thread(std::bind(&progress::progress_thread, this)); } 
-void progress::operator()(uint64_t iter) +void progress::print() { using namespace std::chrono; - if (iter - last_iter_ < min_iters_ && iter != length_) - return; - + uint64_t iter = iter_; + if (iter == 0) + iter = 1; auto tp = steady_clock::now(); - if (duration_cast(tp - last_update_).count() < interval_ - && iter != length_) - return; - - last_update_ = tp; - last_iter_ = iter; - auto percent = static_cast(iter) / length_; auto elapsed = duration_cast(tp - start_).count(); - auto remain = static_cast(elapsed) / iter * (length_ - iter); + auto remain = (length_ - iter) * static_cast(elapsed) / (iter); + + auto secs = static_cast(remain / 1000); + auto mins = secs / 60; + auto hrs = mins / 60; - std::stringstream ss; - ss << prefix_; + std::ptrdiff_t max_len = 80 - static_cast(prefix_len_) - 20; + if (hrs > 100) + max_len -= 1; - auto eta = eta_str(static_cast(remain)); - // 4 comes from +2 for the [], +5 for the %, +2 for space - auto remaining_width - = std::max(0, 80 - static_cast(prefix_.length()) - - static_cast(eta.length()) - 9); - if (remaining_width > 15) + auto it = output_.begin() + static_cast(prefix_len_) + 1; + auto barend = it + max_len; + auto end = it + static_cast(max_len * percent); + std::fill(it, end, '='); + *end = '>'; + it = barend; + *it++ = ']'; + *it++ = ' '; + + it += ::sprintf(&(*it), "%d%%", static_cast(percent * 100)); + it += ::sprintf(&(*it), " ETA %02d:%02d:%02d", hrs, mins % 60, secs % 60); + + LOG(progress) << '\r' << output_ << ENDLG; +} + +void progress::progress_thread() +{ + while (iter_ != length_) { - auto filled = static_cast(remaining_width * percent); - auto empty = remaining_width - filled - 1; - - ss << '[' << std::string(static_cast(filled), '='); - if (filled != remaining_width) - { - ss << '>' << std::string( - static_cast(std::max(0, empty)), ' '); - } - ss << ']'; - } + print(); - ss << ' ' << static_cast(percent * 100) << "% " << eta; + std::unique_lock lock{mutex_}; + cond_var_.wait_for(lock, std::chrono::milliseconds(interval_)); + } + print(); +} - std::string rem(static_cast( - std::max(0, str_len_ - static_cast(ss.tellp()))), - ' '); - str_len_ = static_cast(ss.tellp()); - ss << rem; +void progress::print_endline(bool endline) +{ + endline_ = endline; +} - LOG(progress) << '\r' << ss.str() << ENDLG; +void progress::operator()(uint64_t iter) +{ + iter_ = (iter < length_) ? iter : length_; } void progress::end() { - finished_ = true; - if (last_iter_ != length_) - (*this)(length_); - if (endline_) - LOG(progress) << '\n' << ENDLG; + if (thread_.joinable()) + { + iter_ = length_; + cond_var_.notify_all(); + thread_.join(); + if (endline_) + LOG(progress) << '\n' << ENDLG; + } } void progress::clear() const @@ -123,8 +109,7 @@ void progress::clear() const progress::~progress() { - if (!finished_) - end(); + end(); } } } From 27166b0953a9a340f27b43500f6d48f2381b14fd Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 2 Feb 2016 18:45:26 -0600 Subject: [PATCH 07/48] Begin work on word embedding support. 
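For background on the weighting used below: the coocurrence tool counts,
for each word, the other in-vocabulary words appearing within a fixed-size
window of it, adding weight 1/d for a pair at distance d and recording
each pair symmetrically in both directions. For example, in the fragment
"the quick brown fox" (all in-vocabulary), the pair (fox, brown) gains
1/1, (fox, quick) gains 1/2, and (fox, the) gains 1/3.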
--- include/meta/hashing/hash_storage.h | 51 +++- include/meta/hashing/hash_traits.h | 4 +- include/meta/hashing/probe_map.h | 4 + include/meta/hashing/probe_set.h | 3 + src/CMakeLists.txt | 1 + src/embeddings/CMakeLists.txt | 3 + src/embeddings/tools/CMakeLists.txt | 5 + src/embeddings/tools/embedding_coocur.cpp | 279 ++++++++++++++++++++++ src/embeddings/tools/embedding_vocab.cpp | 134 +++++++++++ 9 files changed, 478 insertions(+), 6 deletions(-) create mode 100644 src/embeddings/CMakeLists.txt create mode 100644 src/embeddings/tools/CMakeLists.txt create mode 100644 src/embeddings/tools/embedding_coocur.cpp create mode 100644 src/embeddings/tools/embedding_vocab.cpp diff --git a/include/meta/hashing/hash_storage.h b/include/meta/hashing/hash_storage.h index bc8b45ade..2f7982aba 100644 --- a/include/meta/hashing/hash_storage.h +++ b/include/meta/hashing/hash_storage.h @@ -387,6 +387,16 @@ class storage_base using hash_type = typename storage_traits::hash_type; using equal_type = typename storage_traits::equal_type; + constexpr static double default_max_load_factor() + { + return 0.85; + } + + constexpr static double default_resize_ratio() + { + return 1.5; + } + iterator begin() { return {as_derived(), 0}; @@ -553,8 +563,8 @@ class storage_base hash_type hash_; equal_type equal_; - double max_load_factor_ = 0.85; - double resize_ratio_ = 1.5; + double max_load_factor_ = default_max_load_factor(); + double resize_ratio_ = default_resize_ratio(); }; /** @@ -908,6 +918,11 @@ class inline_key_value_storage + sizeof(std::size_t); } + vector_type extract() && + { + return std::move(table_); + } + vector_type table_; std::size_t size_; }; @@ -1019,8 +1034,8 @@ class inline_key_external_value_storage { assert(new_cap > capacity()); - std::vector> temptable( - new_cap, std::make_pair(key_traits::sentinel(), 0)); + key_vector_type temptable(new_cap, + std::make_pair(key_traits::sentinel(), 0)); using std::swap; swap(table_, temptable); @@ -1041,6 +1056,26 @@ class inline_key_external_value_storage + sizeof(V) * values_.capacity(); } + std::vector> extract() && + { + std::vector> ret; + ret.reserve(values_.size()); + + for (auto& key_pr : table_) + { + if (!this->key_equal(key_pr.first, key_traits::sentinel())) + { + ret.emplace_back(std::move(key_pr.first), + std::move(values_[key_pr.second])); + } + } + + key_vector_type{}.swap(table_); + value_vector_type{}.swap(values_); + + return ret; + } + key_vector_type table_; value_vector_type values_; }; @@ -1166,6 +1201,14 @@ class external_key_value_storage + sizeof(std::pair) * storage_.capacity(); } + kv_vector_type extract() && + { + idx_vector_type{}.swap(table_); + kv_vector_type ret; + storage_.swap(ret); + return ret; + } + idx_vector_type table_; kv_vector_type storage_; }; diff --git a/include/meta/hashing/hash_traits.h b/include/meta/hashing/hash_traits.h index 9fcdaf787..5472d986e 100644 --- a/include/meta/hashing/hash_traits.h +++ b/include/meta/hashing/hash_traits.h @@ -99,7 +99,7 @@ struct hash_traits> { template using key_inlineable_storage = typename std:: - conditional::inlineable, + conditional, inline_key_external_value_storage> KeyEqual>>::type; using key_inlineable_probe_entry = - typename std::conditional::inlineable, std::pair, + typename std::conditional, std::pair>::type; using probe_entry = diff --git a/include/meta/hashing/probe_map.h b/include/meta/hashing/probe_map.h index 3fdfe4b27..91e981193 100644 --- a/include/meta/hashing/probe_map.h +++ b/include/meta/hashing/probe_map.h @@ -46,6 +46,9 @@ class probe_map using 
typename storage_type::iterator;
     using typename storage_type::const_iterator;
 
+    using storage_type::default_max_load_factor;
+    using storage_type::default_resize_ratio;
+
     using storage_type::storage_type;
     using storage_type::begin;
     using storage_type::end;
@@ -60,6 +63,7 @@ class probe_map
     using storage_type::capacity;
     using storage_type::clear;
     using storage_type::bytes_used;
+    using storage_type::extract;
 
     probe_map() : storage_type{8}
     {
diff --git a/include/meta/hashing/probe_set.h b/include/meta/hashing/probe_set.h
index ec276cf9b..dfe578163 100644
--- a/include/meta/hashing/probe_set.h
+++ b/include/meta/hashing/probe_set.h
@@ -49,6 +49,9 @@ class probe_set
     using typename storage_type::iterator;
     using typename storage_type::const_iterator;
 
+    using storage_type::default_max_load_factor;
+    using storage_type::default_resize_ratio;
+
     using storage_type::storage_type;
     using storage_type::begin;
     using storage_type::end;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0ae3d497d..374cbbaf7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,6 +3,7 @@ project(meta)
 add_subdirectory(analyzers)
 add_subdirectory(classify)
 add_subdirectory(corpus)
+add_subdirectory(embeddings)
 add_subdirectory(features)
 add_subdirectory(graph)
 add_subdirectory(index)
diff --git a/src/embeddings/CMakeLists.txt b/src/embeddings/CMakeLists.txt
new file mode 100644
index 000000000..c61f3800f
--- /dev/null
+++ b/src/embeddings/CMakeLists.txt
@@ -0,0 +1,3 @@
+project(meta-embeddings)
+
+add_subdirectory(tools)
diff --git a/src/embeddings/tools/CMakeLists.txt b/src/embeddings/tools/CMakeLists.txt
new file mode 100644
index 000000000..acea8b789
--- /dev/null
+++ b/src/embeddings/tools/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(embedding-vocab embedding_vocab.cpp)
+target_link_libraries(embedding-vocab meta-analyzers meta-util meta-io)
+
+add_executable(embedding-coocur embedding_coocur.cpp)
+target_link_libraries(embedding-coocur meta-analyzers meta-util meta-io)
diff --git a/src/embeddings/tools/embedding_coocur.cpp b/src/embeddings/tools/embedding_coocur.cpp
new file mode 100644
index 000000000..e0000fd61
--- /dev/null
+++ b/src/embeddings/tools/embedding_coocur.cpp
@@ -0,0 +1,279 @@
+/**
+ * @file embedding_coocur.cpp
+ * @author Chase Geigle
+ *
+ * This tool builds the weighted coocurrence matrix for the GloVe training
+ * method.
+ */
+
+#include <deque>
+
+#include "cpptoml.h"
+#include "meta/analyzers/all.h"
+#include "meta/analyzers/token_stream.h"
+#include "meta/corpus/corpus_factory.h"
+#include "meta/hashing/probe_map.h"
+#include "meta/io/packed.h"
+#include "meta/logging/logger.h"
+#include "meta/util/progress.h"
+#include "meta/util/printing.h"
+
+using namespace meta;
+
+namespace meta
+{
+namespace hashing
+{
+template <class K1, class K2>
+struct key_traits<std::pair<K1, K2>>
+{
+    static constexpr bool inlineable
+        = key_traits<K1>::inlineable && key_traits<K2>::inlineable;
+
+    constexpr static std::pair<K1, K2> sentinel()
+    {
+        return {key_traits<K1>::sentinel(), key_traits<K2>::sentinel()};
+    }
+};
+}
+}
+
+class coocur_buffer
+{
+  public:
+    coocur_buffer(std::size_t max_ram, util::string_view prefix)
+        : max_bytes_{max_ram},
+          prefix_{prefix.to_string()},
+          coocur_{static_cast<std::size_t>(max_bytes_ / sizeof(count_t))}
+    {
+        // nothing
+    }
+
+    void flush()
+    {
+        LOG(info) << "Flushing buffer of size: "
+                  << printing::bytes_to_units(coocur_.bytes_used()) << " with "
+                  << coocur_.size() << " unique pairs" << ENDLG;
+
+        {
+            auto items = std::move(coocur_).extract();
+            std::sort(items.begin(), items.end(),
+                      [](const count_t& a, const count_t& b)
+                      {
+                          return a.first < b.first;
+                      });
+
+            std::ofstream output{prefix_ + "/chunk-"
+                                     + std::to_string(chunk_num_),
+                                 std::ios::binary};
+            io::packed::write(output, items.size());
+            for (const auto& pr : items)
+            {
+                io::packed::write(output, pr.first.first);
+                io::packed::write(output, pr.first.second);
+                io::packed::write(output, pr.second);
+            }
+        }
+
+        coocur_
+            = map_t{static_cast<std::size_t>(max_bytes_ / sizeof(count_t))};
+        ++chunk_num_;
+    }
+
+    void operator()(uint64_t target, uint64_t context, double weight)
+    {
+        auto it = coocur_.find(std::make_pair(target, context));
+        if (it == coocur_.end())
+        {
+            maybe_flush();
+            coocur_[std::make_pair(target, context)] = weight;
+        }
+        else
+        {
+            it->value() += weight;
+        }
+    }
+
+    std::size_t num_chunks() const
+    {
+        return chunk_num_;
+    }
+
+  private:
+    void maybe_flush()
+    {
+        // check if inserting a new coocurrence would cause a resize
+        if (coocur_.next_load_factor() >= coocur_.max_load_factor())
+        {
+            // see if the newly resized table would fit in ram
+            auto bytes_used = coocur_.bytes_used() * coocur_.resize_ratio();
+
+            if (bytes_used >= max_bytes_)
+            {
+                flush();
+            }
+        }
+    }
+
+    using count_t = std::pair<std::pair<uint64_t, uint64_t>, double>;
+    using map_t
+        = meta::hashing::probe_map<std::pair<uint64_t, uint64_t>, double>;
+    const std::size_t max_bytes_;
+    const std::string prefix_;
+    map_t coocur_;
+    std::size_t chunk_num_ = 0;
+};
+
+std::unique_ptr<analyzers::token_stream>
+make_stream(const cpptoml::table& config)
+{
+    std::unique_ptr<analyzers::token_stream> stream;
+    auto analyzers = config.get_table_array("analyzers");
+    for (const auto& group : analyzers->get())
+    {
+        auto method = group->get_as<std::string>("method");
+        if (!method)
+            continue;
+
+        if (*method == analyzers::ngram_word_analyzer::id)
+        {
+            stream = analyzers::load_filters(config, *group);
+            break;
+        }
+    }
+    return stream;
+}
+
+hashing::probe_map<std::string, uint64_t>
+load_vocab(const std::string& filename)
+{
+    using map_type = hashing::probe_map<std::string, uint64_t>;
+
+    std::ifstream input{filename, std::ios::binary};
+    auto size = io::packed::read<uint64_t>(input);
+    auto reserve_size = static_cast<uint64_t>(
+        std::ceil(size / map_type::default_max_load_factor()));
+
+    printing::progress progress{" > Loading vocab: ", size};
+    map_type vocab{reserve_size};
+    for (uint64_t tid{0}; tid < size; ++tid)
+    {
+        progress(tid);
+        auto word = io::packed::read<std::string>(input);
+        io::packed::read<uint64_t>(input);
+
+        vocab[word] = tid;
+    }
+
+    return vocab;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 2)
+    {
+        std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl;
+        return 1;
+    }
+
+    logging::set_cerr_logging();
+
+    auto config = cpptoml::parse_file(argv[1]);
+
+    // extract building parameters
+    auto embed_cfg = config->get_table("embeddings");
+    auto prefix = *embed_cfg->get_as<std::string>("prefix");
+    auto vocab_filename = prefix + "/vocab.bin";
+    auto window_size = static_cast<std::size_t>(
+        embed_cfg->get_as<int64_t>("window-size").value_or(15));
+    auto max_ram = static_cast<std::size_t>(
+        embed_cfg->get_as<int64_t>("max-ram").value_or(4096)) * 1024 * 1024;
+
+    if (!filesystem::file_exists(vocab_filename))
+    {
+        LOG(fatal) << "Vocabulary file has not yet been generated, please do "
+                      "this before building the coocurrence table"
+                   << ENDLG;
+        return 1;
+    }
+
+    auto vocab = load_vocab(vocab_filename);
+    LOG(info) << "Loaded vocabulary of size " << vocab.size() << " occupying "
+              << printing::bytes_to_units(vocab.bytes_used()) << ENDLG;
+
+    if (max_ram <= vocab.bytes_used())
+    {
+        LOG(fatal) << "RAM limit too restrictive" << ENDLG;
+        return 1;
+    }
+
+    max_ram -= vocab.bytes_used();
+    if (max_ram < 1024 * 1024)
+    {
+        LOG(fatal) << "RAM limit too restrictive" << ENDLG;
+        return 1;
+    }
+
+    auto stream = make_stream(*config);
+    if (!stream)
+    {
+        LOG(fatal) << "Failed to find an ngram-word analyzer configuration in "
+                   << argv[1] << ENDLG;
+        return 1;
+    }
+
+    coocur_buffer coocur{max_ram, prefix};
+
+    {
+        auto docs = corpus::make_corpus(*config);
+        printing::progress progress{" > Counting coocurrences: ",
+                                    docs->size()};
+        for (uint64_t i = 0; docs->has_next(); ++i)
+        {
+            progress(i);
+            auto doc = docs->next();
+            stream->set_content(analyzers::get_content(doc));
+
+            std::deque<uint64_t> history;
+            while (*stream)
+            {
+                auto tok = stream->next();
+
+                if (tok == "<s>")
+                {
+                    history.clear();
+                }
+                else if (tok == "</s>")
+                {
+                    continue;
+                }
+                else
+                {
+                    // ignore out-of-vocabulary words
+                    auto it = vocab.find(tok);
+                    if (it == vocab.end())
+                        continue;
+
+                    auto tid = it->value();
+
+                    // everything in history is a left-context of tid.
+                    // Likewise, tid is a right-context of everything in
+                    // history.
+                    for (auto it = history.begin(), end = history.end();
+                         it != end; ++it)
+                    {
+                        auto dist = std::distance(it, end);
+                        coocur(tid, *it, 1.0 / dist);
+                        coocur(*it, tid, 1.0 / dist);
+                    }
+
+                    history.push_back(tid);
+                    if (history.size() > window_size)
+                        history.pop_front();
+                }
+            }
+        }
+    }
+
+    coocur.flush();
+
+    return 0;
+}
diff --git a/src/embeddings/tools/embedding_vocab.cpp b/src/embeddings/tools/embedding_vocab.cpp
new file mode 100644
index 000000000..90be60953
--- /dev/null
+++ b/src/embeddings/tools/embedding_vocab.cpp
@@ -0,0 +1,134 @@
+/**
+ * @file embedding_vocab.cpp
+ * @author Chase Geigle
+ *
+ * This tool builds the vocabulary file for the other word embedding tools.
+ */
+
+#include "cpptoml.h"
+#include "meta/analyzers/all.h"
+#include "meta/analyzers/token_stream.h"
+#include "meta/corpus/corpus_factory.h"
+#include "meta/hashing/probe_map.h"
+#include "meta/io/packed.h"
+#include "meta/logging/logger.h"
+#include "meta/util/progress.h"
+
+using namespace meta;
+
+std::unique_ptr<analyzers::token_stream>
+make_stream(const cpptoml::table& config)
+{
+    std::unique_ptr<analyzers::token_stream> stream;
+    auto analyzers = config.get_table_array("analyzers");
+    for (const auto& group : analyzers->get())
+    {
+        auto method = group->get_as<std::string>("method");
+        if (!method)
+            continue;
+
+        if (*method == analyzers::ngram_word_analyzer::id)
+        {
+            stream = analyzers::load_filters(config, *group);
+            break;
+        }
+    }
+    return stream;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 2)
+    {
+        std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl;
+        return 1;
+    }
+
+    logging::set_cerr_logging();
+
+    auto config = cpptoml::parse_file(argv[1]);
+
+    // extract vocab building parameters
+    auto embed_cfg = config->get_table("embeddings");
+    auto vocab_filename
+        = *embed_cfg->get_as<std::string>("prefix") + "/vocab.bin";
+    auto vocab_cfg = embed_cfg->get_table("vocab");
+
+    auto min_count = vocab_cfg->get_as<int64_t>("min-count").value_or(100);
+    auto max_size = vocab_cfg->get_as<int64_t>("max-size")
+                        .value_or(std::numeric_limits<int64_t>::max());
+
+    auto stream = make_stream(*config);
+    if (!stream)
+    {
+        LOG(fatal) << "Failed to find an ngram-word analyzer configuration in "
+                   << argv[1] << ENDLG;
+        return 1;
+    }
+
+    hashing::probe_map<std::string, uint64_t> vocab;
+
+    {
+        auto docs = corpus::make_corpus(*config);
+        printing::progress progress{" > Building vocabulary: ", docs->size()};
+        for (uint64_t i = 0; docs->has_next(); ++i)
+        {
+            progress(i);
+            auto doc = docs->next();
+            stream->set_content(analyzers::get_content(doc));
+
+            while (*stream)
+                ++vocab[stream->next()];
+        }
+    }
+
+    LOG(info) << "Found " << vocab.size() << " unique words" << ENDLG;
+
+    LOG(progress) << "> Sorting vocab...\n" << ENDLG;
+    auto items = std::move(vocab).extract();
+
+    auto begin = std::begin(items);
+    auto end = std::end(items);
+    auto middle = end;
+    if (items.size() > static_cast<uint64_t>(max_size))
+        middle = begin + max_size;
+
+    using count_t = std::pair<std::string, uint64_t>;
+
+    // partial sort to avoid doing redundant work if our desired vocab
+    // size is smaller than items.size()
+    std::partial_sort(begin, middle, end,
+                      [](const count_t& a, const count_t& b)
+                      {
+                          return a.second > b.second;
+                      });
+
+    // truncate the vocabulary if needed: locate the position one past the
+    // last element that is greater than the threshold value
+    auto it
+        = std::lower_bound(begin, middle, static_cast<uint64_t>(min_count) - 1,
+                           [](const count_t& a, uint64_t thresh)
+                           {
+                               // comparison is reversed since we're
+                               // working on reverse-sorted data
+                               return a.second > thresh;
+                           });
+
+    auto size = static_cast<uint64_t>(std::distance(begin, it));
+
+    LOG(info) << "Vocab truncated to size " << size << ENDLG;
+
+    {
+        std::ofstream output{vocab_filename, std::ios::binary};
+        printing::progress progress{" > Writing vocab: ", size};
+
+        io::packed::write(output, size);
+        for (uint64_t i = 0; begin != it; ++begin, ++i)
+        {
+            progress(i);
+            io::packed::write(output, begin->first);
+            io::packed::write(output, begin->second);
+        }
+    }
+
+    return 0;
+}

From 16fa2defcb8373f27b7ec21bd2a6d488d596386a Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Wed, 3 Feb 2016 05:04:00 -0600
Subject: [PATCH 08/48] Fix probe_map::extract() for inline_key_value_storage
 type.

Old implementation forgot to delete all sentinel values before returning
the vector.
---
 include/meta/hashing/hash_storage.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/meta/hashing/hash_storage.h b/include/meta/hashing/hash_storage.h
index 2f7982aba..a7c4ea73e 100644
--- a/include/meta/hashing/hash_storage.h
+++ b/include/meta/hashing/hash_storage.h
@@ -920,6 +920,15 @@ class inline_key_value_storage
 
     vector_type extract() &&
     {
+        // get rid of all blank cells
+        table_.erase(std::remove_if(table_.begin(), table_.end(),
+                                    [this](const std::pair<K, V>& pr)
+                                    {
+                                        return this->key_equal(
+                                            pr.first,
+                                            key_traits<K>::sentinel());
+                                    }),
+                     table_.end());
         return std::move(table_);
     }
 

From e61987378c9b31ac60f94ba58408f8c8f681b009 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Wed, 3 Feb 2016 05:04:51 -0600
Subject: [PATCH 09/48] Ensure that the embeddings prefix directory exists.

---
 src/embeddings/tools/embedding_vocab.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/embeddings/tools/embedding_vocab.cpp b/src/embeddings/tools/embedding_vocab.cpp
index 90be60953..68be8e01b 100644
--- a/src/embeddings/tools/embedding_vocab.cpp
+++ b/src/embeddings/tools/embedding_vocab.cpp
@@ -50,8 +50,8 @@ int main(int argc, char** argv)
 
     // extract vocab building parameters
     auto embed_cfg = config->get_table("embeddings");
-    auto vocab_filename
-        = *embed_cfg->get_as<std::string>("prefix") + "/vocab.bin";
+    auto prefix = *embed_cfg->get_as<std::string>("prefix");
+    auto vocab_filename = prefix + "/vocab.bin";
     auto vocab_cfg = embed_cfg->get_table("vocab");
 
     auto min_count = vocab_cfg->get_as<int64_t>("min-count").value_or(100);
@@ -118,6 +118,7 @@ int main(int argc, char** argv)
     LOG(info) << "Vocab truncated to size " << size << ENDLG;
 
     {
+        filesystem::make_directory(prefix);
         std::ofstream output{vocab_filename, std::ios::binary};
         printing::progress progress{" > Writing vocab: ", size};
 

From d7047abcfdf3e8ad58cbc6557189c09b759aa624 Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Wed, 3 Feb 2016 05:09:15 -0600
Subject: [PATCH 10/48] Merge on-disk coocurrence chunks into one master file.
--- src/embeddings/tools/embedding_coocur.cpp | 152 +++++++++++++++++++++- 1 file changed, 150 insertions(+), 2 deletions(-) diff --git a/src/embeddings/tools/embedding_coocur.cpp b/src/embeddings/tools/embedding_coocur.cpp index e0000fd61..537258c09 100644 --- a/src/embeddings/tools/embedding_coocur.cpp +++ b/src/embeddings/tools/embedding_coocur.cpp @@ -15,6 +15,7 @@ #include "meta/hashing/probe_map.h" #include "meta/io/packed.h" #include "meta/logging/logger.h" +#include "meta/util/multiway_merge.h" #include "meta/util/progress.h" #include "meta/util/printing.h" @@ -38,6 +39,126 @@ struct key_traits> } } +struct coocur_record +{ + uint64_t target; + uint64_t context; + double weight; + + void merge_with(coocur_record&& other) + { + weight += other.weight; + } + + template + uint64_t write(OutputStream& os) const + { + auto bytes = io::packed::write(os, target); + bytes += io::packed::write(os, context); + bytes += io::packed::write(os, weight); + return bytes; + } +}; + +bool operator==(const coocur_record& a, const coocur_record& b) +{ + return std::tie(a.target, a.context) == std::tie(b.target, b.context); +} + +bool operator!=(const coocur_record& a, const coocur_record& b) +{ + return !(a == b); +} + +bool operator<(const coocur_record& a, const coocur_record& b) +{ + return std::tie(a.target, a.context) < std::tie(b.target, b.context); +} + +class coocur_chunk_iterator +{ + public: + using value_type = coocur_record; + + coocur_chunk_iterator(const std::string& filename) + : path_{filename}, + input_{make_unique(filename, std::ios::binary)}, + total_bytes_{filesystem::file_size(filename)}, + bytes_read_{0} + { + ++(*this); + } + + coocur_chunk_iterator() = default; + coocur_chunk_iterator(coocur_chunk_iterator&&) = default; + + ~coocur_chunk_iterator() + { + if (input_) + { + input_ = nullptr; + filesystem::delete_file(path_); + } + } + + coocur_chunk_iterator& operator++() + { + if (input_->get() == EOF) + return *this; + + input_->unget(); + bytes_read_ += io::packed::read(*input_, record_.target); + bytes_read_ += io::packed::read(*input_, record_.context); + bytes_read_ += io::packed::read(*input_, record_.weight); + return *this; + } + + coocur_record& operator*() + { + return record_; + } + + const coocur_record& operator*() const + { + return record_; + } + + bool operator==(const coocur_chunk_iterator& other) const + { + if (!other.input_) + { + return !input_ || !static_cast(*input_); + } + else + { + return std::tie(path_, bytes_read_) + == std::tie(other.path_, other.bytes_read_); + } + } + + uint64_t total_bytes() const + { + return total_bytes_; + } + + uint64_t bytes_read() const + { + return bytes_read_; + } + + private: + std::string path_; + std::unique_ptr input_; + coocur_record record_; + uint64_t total_bytes_; + uint64_t bytes_read_; +}; + +bool operator!=(const coocur_chunk_iterator& a, const coocur_chunk_iterator& b) +{ + return !(a == b); +} + class coocur_buffer { public: @@ -66,7 +187,6 @@ class coocur_buffer std::ofstream output{prefix_ + "/chunk-" + std::to_string(chunk_num_), std::ios::binary}; - io::packed::write(output, items.size()); for (const auto& pr : items) { io::packed::write(output, pr.first.first); @@ -98,6 +218,23 @@ class coocur_buffer return chunk_num_; } + uint64_t merge_chunks() + { + coocur_ = map_t{}; + std::vector chunks; + chunks.reserve(num_chunks()); + + for (std::size_t i = 0; i < num_chunks(); ++i) + chunks.emplace_back(prefix_ + "/chunk-" + std::to_string(i)); + + std::ofstream output{prefix_ + "/coocur.bin", std::ios::binary}; 
+ return util::multiway_merge(chunks.begin(), chunks.end(), + [&](coocur_record&& record) + { + record.write(output); + }); + } + private: void maybe_flush() { @@ -186,7 +323,8 @@ int main(int argc, char** argv) auto window_size = static_cast( embed_cfg->get_as("window-size").value_or(15)); auto max_ram = static_cast( - embed_cfg->get_as("max-ram").value_or(4096)) * 1024 * 1024; + embed_cfg->get_as("max-ram").value_or(4096)) + * 1024 * 1024; if (!filesystem::file_exists(vocab_filename)) { @@ -273,7 +411,17 @@ int main(int argc, char** argv) } } + // flush any remaining elements coocur.flush(); + // merge all on-disk chunks + auto uniq = coocur.merge_chunks(); + + LOG(info) << "Coocurrence matrix elements: " << uniq << ENDLG; + LOG(info) << "Coocurrence matrix size: " + << printing::bytes_to_units( + filesystem::file_size(prefix + "/coocur.bin")) + << ENDLG; + return 0; } From 0505e056993571c81e040390eba6782089197d96 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 3 Feb 2016 06:06:24 -0600 Subject: [PATCH 11/48] Attempt to fix build on older GCC versions. --- include/meta/util/multiway_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/meta/util/multiway_merge.h b/include/meta/util/multiway_merge.h index 61b49da77..e5c58cf33 100644 --- a/include/meta/util/multiway_merge.h +++ b/include/meta/util/multiway_merge.h @@ -115,7 +115,7 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, auto range = std::equal_range(to_merge.begin(), to_merge.end(), to_merge.front(), chunk_iter_comp); - Record merged{std::move(*(*range.first).get())}; + auto merged = std::move(*(*range.first).get()); ++(*range.first).get(); ++range.first; std::for_each(range.first, range.second, [&](ChunkIterator& iter) From 1b5fde03009ac6688274429ac90283e8ca947191 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 3 Feb 2016 14:04:39 -0600 Subject: [PATCH 12/48] Remove unused using declaration. --- include/meta/util/multiway_merge.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/meta/util/multiway_merge.h b/include/meta/util/multiway_merge.h index e5c58cf33..f95b5608b 100644 --- a/include/meta/util/multiway_merge.h +++ b/include/meta/util/multiway_merge.h @@ -76,7 +76,6 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, RecordHandler&& output) { using ChunkIterator = typename ForwardIterator::value_type; - using Record = typename ChunkIterator::value_type; uint64_t to_read = std::accumulate( begin, end, 0ul, [](uint64_t acc, const ChunkIterator& chunk) From ff94191ce667b308c6813ae5d00037df90b01b40 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 3 Feb 2016 14:05:28 -0600 Subject: [PATCH 13/48] Begin initial work on GloVe training tool. Refactor the existing embedding-coocur code to pull out the reusable components into the meta::embeddings namespace. Add the first step of GloVe training, which is to shuffle and partition the coocurrence matrix into num_threads chunks, each of which can be operated on in parallel. Add initial skeleton of what interacting with the glove class might look like. 
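The shuffle-and-partition step described above might look roughly like the
sketch below (an assumption-laden illustration, not glove.cpp itself: the
function name shuffle_partition, the output file names, and the
uniformly-random-destination policy are all made up for exposition; only
coocur_record/coocur_iterator come from this patch):

    #include <cstddef>
    #include <fstream>
    #include <random>
    #include <string>
    #include <vector>

    #include "meta/embeddings/coocur_iterator.h"

    using namespace meta;

    // stream over the merged coocurrence file once, appending each record
    // to a randomly chosen chunk file; each chunk then fits in memory and
    // can be shuffled and consumed by its own worker thread
    void shuffle_partition(const std::string& prefix, std::size_t num_chunks)
    {
        std::mt19937 rng{std::random_device{}()};
        std::uniform_int_distribution<std::size_t> dist{0, num_chunks - 1};

        std::vector<std::ofstream> chunks;
        for (std::size_t i = 0; i < num_chunks; ++i)
            chunks.emplace_back(prefix + "/shuffled-" + std::to_string(i),
                                std::ios::binary);

        for (embeddings::coocur_iterator it{prefix + "/coocur.bin"};
             it != embeddings::coocur_iterator{}; ++it)
        {
            (*it).write(chunks[dist(rng)]);
        }
    }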
--- include/meta/embeddings/coocur_iterator.h | 101 ++++++++ include/meta/embeddings/coocur_record.h | 70 ++++++ src/embeddings/tools/CMakeLists.txt | 3 + src/embeddings/tools/embedding_coocur.cpp | 143 ++--------- src/embeddings/tools/glove.cpp | 280 ++++++++++++++++++++++ 5 files changed, 471 insertions(+), 126 deletions(-) create mode 100644 include/meta/embeddings/coocur_iterator.h create mode 100644 include/meta/embeddings/coocur_record.h create mode 100644 src/embeddings/tools/glove.cpp diff --git a/include/meta/embeddings/coocur_iterator.h b/include/meta/embeddings/coocur_iterator.h new file mode 100644 index 000000000..37135829f --- /dev/null +++ b/include/meta/embeddings/coocur_iterator.h @@ -0,0 +1,101 @@ +/** + * @file coocur_iterator.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_EMBEDDINGS_COOCUR_ITERATOR_H_ +#define META_EMBEDDINGS_COOCUR_ITERATOR_H_ + +#include + +#include "meta/embeddings/coocur_record.h" +#include "meta/io/filesystem.h" +#include "meta/util/shim.h" + +namespace meta +{ +namespace embeddings +{ + +/** + * An iterator over coocur_record's that live in a packed file on disk. + * Satisfies the ChunkIterator concept for multiway_merge support. + */ +class coocur_iterator +{ + public: + using value_type = coocur_record; + + coocur_iterator(const std::string& filename) + : path_{filename}, + input_{make_unique(filename, std::ios::binary)}, + total_bytes_{filesystem::file_size(filename)}, + bytes_read_{0} + { + ++(*this); + } + + coocur_iterator() = default; + coocur_iterator(coocur_iterator&&) = default; + + coocur_iterator& operator++() + { + if (input_->peek() == EOF) + return *this; + + bytes_read_ += record_.read(*input_); + return *this; + } + + coocur_record& operator*() + { + return record_; + } + + const coocur_record& operator*() const + { + return record_; + } + + bool operator==(const coocur_iterator& other) const + { + if (!other.input_) + { + return !input_ || !static_cast(*input_); + } + else + { + return std::tie(path_, bytes_read_) + == std::tie(other.path_, other.bytes_read_); + } + } + + uint64_t total_bytes() const + { + return total_bytes_; + } + + uint64_t bytes_read() const + { + return bytes_read_; + } + + private: + std::string path_; + std::unique_ptr input_; + coocur_record record_; + uint64_t total_bytes_; + uint64_t bytes_read_; +}; + +bool operator!=(const coocur_iterator& a, const coocur_iterator& b) +{ + return !(a == b); +} +} +} +#endif diff --git a/include/meta/embeddings/coocur_record.h b/include/meta/embeddings/coocur_record.h new file mode 100644 index 000000000..e7d35e42a --- /dev/null +++ b/include/meta/embeddings/coocur_record.h @@ -0,0 +1,70 @@ +/** + * @file coocur_record.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_EMBEDDINGS_COOCUR_RECORD_H_ +#define META_EMBEDDINGS_COOCUR_RECORD_H_ + +#include +#include "meta/io/packed.h" + +namespace meta +{ +namespace embeddings +{ +/** + * Represents an entry in the coocurrence matrix. Satisfies the Record + * concept for multiway_merge support. 
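+ *
+ * Records compare (and sort) on their (target, context) key;
+ * multiway_merge combines equal records by summing their weights via
+ * merge_with().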
+ */ +struct coocur_record +{ + uint64_t target; + uint64_t context; + double weight; + + void merge_with(coocur_record&& other) + { + weight += other.weight; + } + + template + uint64_t write(OutputStream& os) const + { + auto bytes = io::packed::write(os, target); + bytes += io::packed::write(os, context); + bytes += io::packed::write(os, weight); + return bytes; + } + + template + uint64_t read(InputStream& is) + { + auto bytes = io::packed::read(is, target); + bytes += io::packed::read(is, context); + bytes += io::packed::read(is, weight); + return bytes; + } +}; + +bool operator==(const coocur_record& a, const coocur_record& b) +{ + return std::tie(a.target, a.context) == std::tie(b.target, b.context); +} + +bool operator!=(const coocur_record& a, const coocur_record& b) +{ + return !(a == b); +} + +bool operator<(const coocur_record& a, const coocur_record& b) +{ + return std::tie(a.target, a.context) < std::tie(b.target, b.context); +} +} +} +#endif diff --git a/src/embeddings/tools/CMakeLists.txt b/src/embeddings/tools/CMakeLists.txt index acea8b789..36d9e30cb 100644 --- a/src/embeddings/tools/CMakeLists.txt +++ b/src/embeddings/tools/CMakeLists.txt @@ -3,3 +3,6 @@ target_link_libraries(embedding-vocab meta-analyzers meta-util meta-io) add_executable(embedding-coocur embedding_coocur.cpp) target_link_libraries(embedding-coocur meta-analyzers meta-util meta-io) + +add_executable(glove glove.cpp) +target_link_libraries(glove meta-util meta-io cpptoml) diff --git a/src/embeddings/tools/embedding_coocur.cpp b/src/embeddings/tools/embedding_coocur.cpp index 537258c09..b0090576c 100644 --- a/src/embeddings/tools/embedding_coocur.cpp +++ b/src/embeddings/tools/embedding_coocur.cpp @@ -12,6 +12,7 @@ #include "meta/analyzers/all.h" #include "meta/analyzers/token_stream.h" #include "meta/corpus/corpus_factory.h" +#include "meta/embeddings/coocur_iterator.h" #include "meta/hashing/probe_map.h" #include "meta/io/packed.h" #include "meta/logging/logger.h" @@ -39,126 +40,6 @@ struct key_traits> } } -struct coocur_record -{ - uint64_t target; - uint64_t context; - double weight; - - void merge_with(coocur_record&& other) - { - weight += other.weight; - } - - template - uint64_t write(OutputStream& os) const - { - auto bytes = io::packed::write(os, target); - bytes += io::packed::write(os, context); - bytes += io::packed::write(os, weight); - return bytes; - } -}; - -bool operator==(const coocur_record& a, const coocur_record& b) -{ - return std::tie(a.target, a.context) == std::tie(b.target, b.context); -} - -bool operator!=(const coocur_record& a, const coocur_record& b) -{ - return !(a == b); -} - -bool operator<(const coocur_record& a, const coocur_record& b) -{ - return std::tie(a.target, a.context) < std::tie(b.target, b.context); -} - -class coocur_chunk_iterator -{ - public: - using value_type = coocur_record; - - coocur_chunk_iterator(const std::string& filename) - : path_{filename}, - input_{make_unique(filename, std::ios::binary)}, - total_bytes_{filesystem::file_size(filename)}, - bytes_read_{0} - { - ++(*this); - } - - coocur_chunk_iterator() = default; - coocur_chunk_iterator(coocur_chunk_iterator&&) = default; - - ~coocur_chunk_iterator() - { - if (input_) - { - input_ = nullptr; - filesystem::delete_file(path_); - } - } - - coocur_chunk_iterator& operator++() - { - if (input_->get() == EOF) - return *this; - - input_->unget(); - bytes_read_ += io::packed::read(*input_, record_.target); - bytes_read_ += io::packed::read(*input_, record_.context); - bytes_read_ += 
io::packed::read(*input_, record_.weight); - return *this; - } - - coocur_record& operator*() - { - return record_; - } - - const coocur_record& operator*() const - { - return record_; - } - - bool operator==(const coocur_chunk_iterator& other) const - { - if (!other.input_) - { - return !input_ || !static_cast(*input_); - } - else - { - return std::tie(path_, bytes_read_) - == std::tie(other.path_, other.bytes_read_); - } - } - - uint64_t total_bytes() const - { - return total_bytes_; - } - - uint64_t bytes_read() const - { - return bytes_read_; - } - - private: - std::string path_; - std::unique_ptr input_; - coocur_record record_; - uint64_t total_bytes_; - uint64_t bytes_read_; -}; - -bool operator!=(const coocur_chunk_iterator& a, const coocur_chunk_iterator& b) -{ - return !(a == b); -} - class coocur_buffer { public: @@ -221,18 +102,28 @@ class coocur_buffer uint64_t merge_chunks() { coocur_ = map_t{}; - std::vector chunks; + std::vector chunks; chunks.reserve(num_chunks()); for (std::size_t i = 0; i < num_chunks(); ++i) chunks.emplace_back(prefix_ + "/chunk-" + std::to_string(i)); std::ofstream output{prefix_ + "/coocur.bin", std::ios::binary}; - return util::multiway_merge(chunks.begin(), chunks.end(), - [&](coocur_record&& record) - { - record.write(output); - }); + auto num_records + = util::multiway_merge(chunks.begin(), chunks.end(), + [&](embeddings::coocur_record&& record) + { + record.write(output); + }); + chunks.clear(); + + // clean up temporary files + for (std::size_t i = 0; i < num_chunks(); ++i) + { + filesystem::delete_file(prefix_ + "/chunk-" + std::to_string(i)); + } + + return num_records; } private: diff --git a/src/embeddings/tools/glove.cpp b/src/embeddings/tools/glove.cpp new file mode 100644 index 000000000..97fc301e2 --- /dev/null +++ b/src/embeddings/tools/glove.cpp @@ -0,0 +1,280 @@ +/** + * @file glove.cpp + * @author Chase Geigle + * + * This tool builds word embedding vectors from a weighted coocurrence + * matrix using the GloVe model. 
+ * + * @see http://nlp.stanford.edu/projects/glove/ + */ + +#include + +#include "cpptoml.h" +#include "meta/embeddings/coocur_iterator.h" +#include "meta/io/filesystem.h" +#include "meta/io/packed.h" +#include "meta/logging/logger.h" +#include "meta/util/aligned_allocator.h" +#include "meta/util/progress.h" +#include "meta/util/printing.h" +#include "meta/util/random.h" + +using namespace meta; + +template +class array_view +{ + public: + array_view(T* start, std::size_t len) : start_{start}, end_{start + len} + { + // nothing + } + + array_view(T* start, T* end) : start_{start}, end_{end} + { + // nothing + } + + T* begin() const + { + return start_; + } + + T* end() const + { + return end_; + } + + std::size_t size() const + { + return end_ - start_; + } + + private: + T* start_; + T* end_; +}; + +class glove +{ + public: + glove(std::size_t num_words, std::size_t vector_size) + : weights_(num_words * 2 * (vector_size + 1)), vector_size_{vector_size} + { + // two vectors for each word (target and context vectors) + // each has vector_size number of features, plus one bias weight + } + + array_view target_vector(uint64_t term) + { + return {weights_.data() + (term * 2 * (vector_size_ + 1)), + vector_size_}; + } + + array_view target_vector(uint64_t term) const + { + return {weights_.data() + (term * 2 * (vector_size_ + 1)), + vector_size_}; + } + + double& target_bias(uint64_t term) + { + return weights_[term * 2 * (vector_size_ + 1) + vector_size_]; + } + + double target_bias(uint64_t term) const + { + return weights_[term * 2 * (vector_size_ + 1) + vector_size_]; + } + + array_view context_vector(uint64_t term) + { + return {weights_.data() + (term * 2 * (vector_size_ + 1)) + vector_size_ + + 1, + vector_size_}; + } + + array_view context_vector(uint64_t term) const + { + return {weights_.data() + (term * 2 * (vector_size_ + 1)) + vector_size_ + + 1, + vector_size_}; + } + + double& context_bias(uint64_t term) + { + return weights_[term * 2 * (vector_size_ + 1) + 2 * vector_size_ + 1]; + } + + double context_bias(uint64_t term) const + { + return weights_[term * 2 * (vector_size_ + 1) + 2 * vector_size_ + 1]; + } + + double score(uint64_t target, uint64_t context) const + { + auto tv = target_vector(target); + auto cv = context_vector(context); + + return std::inner_product(tv.begin(), tv.end(), cv.begin(), 0.0) + + target_bias(target) + context_bias(context); + } + + private: + util::aligned_vector weights_; + std::size_t vector_size_; +}; + +void shuffle_partition(const std::string& prefix, std::size_t max_ram, + std::size_t num_partitions) +{ + using namespace embeddings; + + using vec_type = std::vector; + using diff_type = vec_type::iterator::difference_type; + + std::mt19937 engine{std::random_device{}()}; + vec_type records(max_ram / sizeof(coocur_record)); + + // read in RAM sized chunks and shuffle in memory and write out to disk + std::vector chunk_sizes; + + uint64_t total_records = 0; + coocur_iterator input{prefix + "/coocur.bin"}; + + { + printing::progress progress{" > Shuffling (pass 1): ", + input.total_bytes()}; + while (input != coocur_iterator{}) + { + std::size_t i = 0; + for (; i < records.size() && input != coocur_iterator{}; + ++i, ++input) + { + progress(input.bytes_read()); + records[i] = *input; + } + + std::shuffle(records.begin(), + records.begin() + static_cast(i), engine); + + std::ofstream output{prefix + "/coocur-shuf." 
+ + std::to_string(chunk_sizes.size()) + + ".tmp", + std::ios::binary}; + + total_records += i; + chunk_sizes.push_back(i); + for (std::size_t j = 0; j < i; ++j) + records[j].write(output); + } + } + + std::vector chunks; + chunks.reserve(chunk_sizes.size()); + for (std::size_t i = 0; i < chunk_sizes.size(); ++i) + { + chunks.emplace_back(prefix + "/coocur-shuf." + std::to_string(i) + + ".tmp"); + } + + std::vector outputs(num_partitions); + for (std::size_t i = 0; i < outputs.size(); ++i) + { + outputs[i].open(prefix + "/coocur-shuf." + std::to_string(i) + ".bin", + std::ios::binary); + } + + { + printing::progress progress{" > Shuffling (pass 2): ", total_records}; + uint64_t num_read = 0; + while (true) + { + // read in records from each chunk on disk. Records are taken from + // chunks with probability proportional to their total size (in + // records). + std::size_t i = 0; + for (std::size_t j = 0; j < chunk_sizes.size(); ++j) + { + auto to_write = std::max( + static_cast(static_cast(chunk_sizes[j]) + / total_records * records.size()), + 1); + + for (std::size_t n = 0; n < to_write; ++n) + { + if (chunks[j] == coocur_iterator{} || i == records.size()) + break; + records[i] = *chunks[j]; + ++chunks[j]; + ++i; + progress(++num_read); + } + } + + if (i == 0) + break; + + // partition the records into the output files randomly + for (std::size_t j = 0; j < i; ++j) + { + auto idx = random::bounded_rand(engine, outputs.size()); + records[j].write(outputs[idx]); + } + } + } + + // delete temporary files + for (std::size_t i = 0; i < chunk_sizes.size(); ++i) + { + filesystem::delete_file(prefix + "/coocur-shuf." + std::to_string(i) + + ".tmp"); + } +} + +int main(int argc, char** argv) +{ + if (argc < 2) + { + std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + + // extract building parameters + auto embed_cfg = config->get_table("embeddings"); + auto prefix = *embed_cfg->get_as("prefix"); + auto max_ram = static_cast( + embed_cfg->get_as("max-ram").value_or(4096)) + * 1024 * 1024; + + if (!filesystem::file_exists(prefix + "/vocab.bin")) + { + LOG(fatal) << "Vocabulary has not yet been generated, please " + "do this before learning word embeddings" + << ENDLG; + return 1; + } + + if (!filesystem::file_exists(prefix + "/coocur.bin")) + { + LOG(fatal) << "Coocurrence matrix has not yet been generated, please " + "do this before learning word embeddings" + << ENDLG; + return 1; + } + + uint64_t num_words = 0; + { + std::ifstream vocab{prefix + "/vocab.bin", std::ios::binary}; + io::packed::read(vocab, num_words); + } + + std::size_t partitions = std::max(std::thread::hardware_concurrency(), 1u); + + shuffle_partition(prefix, max_ram, partitions); +} From 429fabc8ebd50074f2847342d1b26108af513d0e Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Mon, 8 Feb 2016 22:17:18 -0600 Subject: [PATCH 14/48] Add initial implementation of GloVe learning algorithm. Also add a tool to convert MeTA's coocurrence and vocab file formats to the format expected by the original GloVe tool, though it is mainly just for sanity checking our implementation. 
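For a single cooccurrence record (i, j, X_ij), the cost being minimized
is the weighted least-squares objective from the GloVe paper,

    0.5 * f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij)^2,

where f(x) = (x / xmax)^scale for x < xmax and 1 otherwise. Each weight
is updated with AdaGrad; a minimal sketch of the step (the standalone
function below is just for illustration, mirroring update_weight() in
glove.cpp):

    #include <cmath>

    // adaptive gradient step: scale by the inverse root of the
    // accumulated squared gradients, then accumulate this gradient
    void update_weight(double* weight, double* gradsq, double grad)
    {
        *weight -= grad / std::sqrt(*gradsq);
        *gradsq += grad * grad;
    }

The squared-gradient accumulators are initialized to 1.0 rather than 0
so that the first updates are not divided by zero.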
--- src/embeddings/tools/CMakeLists.txt | 10 +- src/embeddings/tools/glove.cpp | 437 ++++++++++++++++++------- src/embeddings/tools/meta_to_glove.cpp | 68 ++++ 3 files changed, 395 insertions(+), 120 deletions(-) create mode 100644 src/embeddings/tools/meta_to_glove.cpp diff --git a/src/embeddings/tools/CMakeLists.txt b/src/embeddings/tools/CMakeLists.txt index 36d9e30cb..11a136084 100644 --- a/src/embeddings/tools/CMakeLists.txt +++ b/src/embeddings/tools/CMakeLists.txt @@ -5,4 +5,12 @@ add_executable(embedding-coocur embedding_coocur.cpp) target_link_libraries(embedding-coocur meta-analyzers meta-util meta-io) add_executable(glove glove.cpp) -target_link_libraries(glove meta-util meta-io cpptoml) +target_link_libraries(glove meta-util + meta-io + cpptoml + ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(meta-to-glove meta_to_glove.cpp) +target_link_libraries(meta-to-glove meta-util + meta-io + cpptoml) diff --git a/src/embeddings/tools/glove.cpp b/src/embeddings/tools/glove.cpp index 97fc301e2..ff0840e67 100644 --- a/src/embeddings/tools/glove.cpp +++ b/src/embeddings/tools/glove.cpp @@ -15,13 +15,130 @@ #include "meta/io/filesystem.h" #include "meta/io/packed.h" #include "meta/logging/logger.h" +#include "meta/parallel/thread_pool.h" #include "meta/util/aligned_allocator.h" #include "meta/util/progress.h" #include "meta/util/printing.h" #include "meta/util/random.h" +#include "meta/util/time.h" using namespace meta; +std::size_t shuffle_partition(const std::string& prefix, std::size_t max_ram, + std::size_t num_partitions) +{ + using namespace embeddings; + + using vec_type = std::vector; + using diff_type = vec_type::iterator::difference_type; + + std::mt19937 engine{std::random_device{}()}; + vec_type records(max_ram / sizeof(coocur_record)); + + // read in RAM sized chunks and shuffle in memory and write out to disk + std::vector chunk_sizes; + + std::size_t total_records = 0; + coocur_iterator input{prefix + "/coocur.bin"}; + + auto elapsed = common::time( + [&]() + { + printing::progress progress{" > Shuffling (pass 1): ", + input.total_bytes()}; + while (input != coocur_iterator{}) + { + std::size_t i = 0; + for (; i < records.size() && input != coocur_iterator{}; + ++i, ++input) + { + progress(input.bytes_read()); + records[i] = *input; + } + + std::shuffle(records.begin(), + records.begin() + static_cast(i), + engine); + + std::ofstream output{prefix + "/coocur-shuf." + + std::to_string(chunk_sizes.size()) + + ".tmp", + std::ios::binary}; + + total_records += i; + chunk_sizes.push_back(i); + for (std::size_t j = 0; j < i; ++j) + records[j].write(output); + } + }); + + LOG(info) << "Shuffling pass 1 took " << elapsed.count() / 1000.0 + << " seconds" << ENDLG; + + std::vector chunks; + chunks.reserve(chunk_sizes.size()); + for (std::size_t i = 0; i < chunk_sizes.size(); ++i) + { + chunks.emplace_back(prefix + "/coocur-shuf." + std::to_string(i) + + ".tmp"); + } + + std::vector outputs(num_partitions); + for (std::size_t i = 0; i < outputs.size(); ++i) + { + outputs[i].open(prefix + "/coocur-shuf." + std::to_string(i) + ".bin", + std::ios::binary); + } + + { + printing::progress progress{" > Shuffling (pass 2): ", total_records}; + uint64_t num_read = 0; + while (true) + { + // read in records from each chunk on disk. Records are taken from + // chunks with probability proportional to their total size (in + // records). 
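+            // together with the random scatter into the output partitions
+            // below, this proportional interleaving approximates a global
+            // shuffle without holding the full matrix in RAM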
+ std::size_t i = 0; + for (std::size_t j = 0; j < chunk_sizes.size(); ++j) + { + auto to_write = std::max( + static_cast(static_cast(chunk_sizes[j]) + / total_records * records.size()), + 1); + + for (std::size_t n = 0; n < to_write; ++n) + { + if (chunks[j] == coocur_iterator{} || i == records.size()) + break; + records[i] = *chunks[j]; + ++chunks[j]; + ++i; + progress(++num_read); + } + } + + if (i == 0) + break; + + // partition the records into the output files randomly + for (std::size_t j = 0; j < i; ++j) + { + auto idx = random::bounded_rand(engine, outputs.size()); + records[j].write(outputs[idx]); + } + } + } + + // delete temporary files + for (std::size_t i = 0; i < chunk_sizes.size(); ++i) + { + filesystem::delete_file(prefix + "/coocur-shuf." + std::to_string(i) + + ".tmp"); + } + + return total_records; +} + template class array_view { @@ -46,6 +163,16 @@ class array_view return end_; } + const T& operator[](std::size_t idx) const + { + return begin()[idx]; + } + + T& operator[](std::size_t idx) + { + return begin()[idx]; + } + std::size_t size() const { return end_ - start_; @@ -56,14 +183,86 @@ class array_view T* end_; }; -class glove +class glove_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + +class glove_trainer { public: - glove(std::size_t num_words, std::size_t vector_size) - : weights_(num_words * 2 * (vector_size + 1)), vector_size_{vector_size} + glove_trainer(const cpptoml::table& embed_cfg) { + // extract building parameters + auto prefix = *embed_cfg.get_as("prefix"); + auto max_ram = static_cast( + embed_cfg.get_as("max-ram").value_or(4096)) + * 1024 * 1024; + vector_size_ = static_cast( + embed_cfg.get_as("vector-size").value_or(50)); + + auto num_threads = static_cast( + embed_cfg.get_as("num-threads") + .value_or(std::max(1u, std::thread::hardware_concurrency()))); + + auto iters = static_cast( + embed_cfg.get_as("max-iter").value_or(25)); + + learning_rate_ + = embed_cfg.get_as("learning-rate").value_or(0.05); + xmax_ = embed_cfg.get_as("xmax").value_or(100.0); + scale_ = embed_cfg.get_as("scale").value_or(0.75); + + if (!filesystem::file_exists(prefix + "/vocab.bin")) + { + LOG(fatal) << "Vocabulary has not yet been generated, please " + "do this before learning word embeddings" + << ENDLG; + throw glove_exception{"no vocabulary file found in " + prefix}; + } + + if (!filesystem::file_exists(prefix + "/coocur.bin")) + { + LOG(fatal) + << "Coocurrence matrix has not yet been generated, please " + "do this before learning word embeddings" + << ENDLG; + throw glove_exception{"no coocurrence matrix found in " + prefix}; + } + + std::size_t num_words = 0; + { + std::ifstream vocab{prefix + "/vocab.bin", std::ios::binary}; + io::packed::read(vocab, num_words); + } + // two vectors for each word (target and context vectors) // each has vector_size number of features, plus one bias weight + auto size = num_words * 2 * (vector_size_ + 1); + weights_.resize(size); + grad_squared_.resize(size, 1.0); + + // randomly initialize the word embeddings and biases + { + std::mt19937 engine{std::random_device{}()}; + std::generate(weights_.begin(), weights_.end(), [&]() + { + // use the word2vec style initialization + // I'm not entirely sure why, but this seems + // to do better than initializing the vectors + // to lie in the unit cube. Maybe scaling? 
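+                    // rnd / 65536.0 - 0.5 is uniform in [-0.5, 0.5);
+                    // dividing by (vector_size_ + 1) shrinks the initial
+                    // weights with the dimensionality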
+ auto rnd = random::bounded_rand(engine, 65536); + return (rnd / 65536.0 - 0.5) / (vector_size_ + 1); + }); + } + + // shuffle the data and partition it into equal parts for each + // thread + auto total_records = shuffle_partition(prefix, max_ram, num_threads); + + // train using the specified number of threads + train(prefix, num_threads, iters, total_records); } array_view target_vector(uint64_t term) @@ -90,26 +289,24 @@ class glove array_view context_vector(uint64_t term) { - return {weights_.data() + (term * 2 * (vector_size_ + 1)) + vector_size_ - + 1, + return {weights_.data() + (term * 2 + 1) * (vector_size_ + 1), vector_size_}; } array_view context_vector(uint64_t term) const { - return {weights_.data() + (term * 2 * (vector_size_ + 1)) + vector_size_ - + 1, + return {weights_.data() + (term * 2 + 1) * (vector_size_ + 1), vector_size_}; } double& context_bias(uint64_t term) { - return weights_[term * 2 * (vector_size_ + 1) + 2 * vector_size_ + 1]; + return weights_[(term * 2 + 1) * (vector_size_ + 1) + vector_size_]; } double context_bias(uint64_t term) const { - return weights_[term * 2 * (vector_size_ + 1) + 2 * vector_size_ + 1]; + return weights_[(term * 2 + 1) * (vector_size_ + 1) + vector_size_]; } double score(uint64_t target, uint64_t context) const @@ -122,116 +319,141 @@ class glove } private: - util::aligned_vector weights_; - std::size_t vector_size_; -}; - -void shuffle_partition(const std::string& prefix, std::size_t max_ram, - std::size_t num_partitions) -{ - using namespace embeddings; - - using vec_type = std::vector; - using diff_type = vec_type::iterator::difference_type; + array_view target_gradsq(uint64_t term) + { + return {grad_squared_.data() + (term * 2 * (vector_size_ + 1)), + vector_size_}; + } - std::mt19937 engine{std::random_device{}()}; - vec_type records(max_ram / sizeof(coocur_record)); + double& target_bias_gradsq(uint64_t term) + { + return grad_squared_[term * 2 * (vector_size_ + 1) + vector_size_]; + } - // read in RAM sized chunks and shuffle in memory and write out to disk - std::vector chunk_sizes; + array_view context_gradsq(uint64_t term) + { + return {grad_squared_.data() + (term * 2 + 1) * (vector_size_ + 1), + vector_size_}; + } - uint64_t total_records = 0; - coocur_iterator input{prefix + "/coocur.bin"}; + double& context_bias_gradsq(uint64_t term) + { + return grad_squared_[(term * 2 + 1) * (vector_size_ + 1) + + vector_size_]; + } + void train(const std::string& prefix, std::size_t num_threads, + std::size_t iters, std::size_t total_records) { - printing::progress progress{" > Shuffling (pass 1): ", - input.total_bytes()}; - while (input != coocur_iterator{}) + parallel::thread_pool pool{num_threads}; + for (std::size_t i = 1; i <= iters; ++i) { - std::size_t i = 0; - for (; i < records.size() && input != coocur_iterator{}; - ++i, ++input) + printing::progress progress{" > Iteration: ", total_records}; + std::atomic_size_t records{0}; + std::vector> futures; + futures.reserve(num_threads); + for (std::size_t t = 0; t < num_threads; ++t) { - progress(input.bytes_read()); - records[i] = *input; + futures.emplace_back(pool.submit_task( + [&, t]() + { + return train_thread(prefix, t, progress, records); + })); } - std::shuffle(records.begin(), - records.begin() + static_cast(i), engine); - - std::ofstream output{prefix + "/coocur-shuf." 
- + std::to_string(chunk_sizes.size()) - + ".tmp", - std::ios::binary}; - - total_records += i; - chunk_sizes.push_back(i); - for (std::size_t j = 0; j < i; ++j) - records[j].write(output); + double total_cost = 0.0; + auto elapsed = common::time([&]() + { + for (auto& fut : futures) + total_cost += fut.get(); + }); + progress.end(); + + LOG(progress) << "> Iteration " << i << "/" << iters + << ": avg cost = " << total_cost / total_records + << ", " << elapsed.count() / 1000.0 << " seconds\n" + << ENDLG; } } - std::vector chunks; - chunks.reserve(chunk_sizes.size()); - for (std::size_t i = 0; i < chunk_sizes.size(); ++i) + double cost_weight(double coocur) const { - chunks.emplace_back(prefix + "/coocur-shuf." + std::to_string(i) - + ".tmp"); + return (coocur < xmax_) ? std::pow(coocur / xmax_, scale_) : 1.0; } - std::vector outputs(num_partitions); - for (std::size_t i = 0; i < outputs.size(); ++i) + void update_weight(double* weight, double* gradsq, double grad) { - outputs[i].open(prefix + "/coocur-shuf." + std::to_string(i) + ".bin", - std::ios::binary); + // adaptive gradient update + *weight -= grad / std::sqrt(*gradsq); + *gradsq += grad * grad; } + double train_thread(const std::string& prefix, std::size_t thread_id, + printing::progress& progress, + std::atomic_size_t& records) { - printing::progress progress{" > Shuffling (pass 2): ", total_records}; - uint64_t num_read = 0; - while (true) + using namespace embeddings; + + coocur_iterator iter{prefix + "/coocur-shuf." + + std::to_string(thread_id) + ".bin"}; + + double cost = 0.0; + + for (; iter != coocur_iterator{}; ++iter) { - // read in records from each chunk on disk. Records are taken from - // chunks with probability proportional to their total size (in - // records). - std::size_t i = 0; - for (std::size_t j = 0; j < chunk_sizes.size(); ++j) - { - auto to_write = std::max( - static_cast(static_cast(chunk_sizes[j]) - / total_records * records.size()), - 1); + progress(records++); + auto record = *iter; - for (std::size_t n = 0; n < to_write; ++n) - { - if (chunks[j] == coocur_iterator{} || i == records.size()) - break; - records[i] = *chunks[j]; - ++chunks[j]; - ++i; - progress(++num_read); - } - } + auto diff = score(record.target, record.context) + - std::log(record.weight); + auto weighted_diff = cost_weight(record.weight) * diff; - if (i == 0) - break; + // cost is weighted squared difference + cost += 0.5 * weighted_diff * diff; - // partition the records into the output files randomly - for (std::size_t j = 0; j < i; ++j) + auto delta = weighted_diff * learning_rate_; + + auto target = target_vector(record.target); + auto targ_gradsq = target_gradsq(record.target); + auto context = context_vector(record.context); + auto ctx_gradsq = context_gradsq(record.context); + + auto target_it = target.begin(); + auto target_grad_it = targ_gradsq.begin(); + auto context_it = context.begin(); + auto context_grad_it = ctx_gradsq.begin(); + auto target_end = target.end(); + + // update the embedding vectors + while (target_it != target_end) { - auto idx = random::bounded_rand(engine, outputs.size()); - records[j].write(outputs[idx]); + auto target_grad = delta * *context_it; + auto context_grad = delta * *target_it; + update_weight(target_it, target_grad_it, target_grad); + update_weight(context_it, context_grad_it, context_grad); + ++target_it; + ++target_grad_it; + ++context_it; + ++context_grad_it; } + + // update the bias terms + update_weight(&target_bias(record.target), + &target_bias_gradsq(record.target), delta); + 
update_weight(&context_bias(record.context), + &context_bias_gradsq(record.context), delta); } - } - // delete temporary files - for (std::size_t i = 0; i < chunk_sizes.size(); ++i) - { - filesystem::delete_file(prefix + "/coocur-shuf." + std::to_string(i) - + ".tmp"); + return cost; } -} + + util::aligned_vector weights_; + util::aligned_vector grad_squared_; + std::size_t vector_size_; + double xmax_; + double scale_; + double learning_rate_; +}; int main(int argc, char** argv) { @@ -244,37 +466,14 @@ int main(int argc, char** argv) logging::set_cerr_logging(); auto config = cpptoml::parse_file(argv[1]); - - // extract building parameters auto embed_cfg = config->get_table("embeddings"); - auto prefix = *embed_cfg->get_as("prefix"); - auto max_ram = static_cast( - embed_cfg->get_as("max-ram").value_or(4096)) - * 1024 * 1024; - - if (!filesystem::file_exists(prefix + "/vocab.bin")) - { - LOG(fatal) << "Vocabulary has not yet been generated, please " - "do this before learning word embeddings" - << ENDLG; - return 1; - } - - if (!filesystem::file_exists(prefix + "/coocur.bin")) + if (!embed_cfg) { - LOG(fatal) << "Coocurrence matrix has not yet been generated, please " - "do this before learning word embeddings" - << ENDLG; + std::cerr << "Missing [embeddings] configuration in " << argv[1] + << std::endl; return 1; } - uint64_t num_words = 0; - { - std::ifstream vocab{prefix + "/vocab.bin", std::ios::binary}; - io::packed::read(vocab, num_words); - } - - std::size_t partitions = std::max(std::thread::hardware_concurrency(), 1u); - - shuffle_partition(prefix, max_ram, partitions); + glove_trainer trainer{*embed_cfg}; + return 0; } diff --git a/src/embeddings/tools/meta_to_glove.cpp b/src/embeddings/tools/meta_to_glove.cpp new file mode 100644 index 000000000..016297cd4 --- /dev/null +++ b/src/embeddings/tools/meta_to_glove.cpp @@ -0,0 +1,68 @@ +/** + * @file embedding_coocur.cpp + * @author Chase Geigle + * + * This tool decompresses the MeTA vocabulary and coocurrence matrix files + * to input that the original GloVe tool can read. + * + * (This is mainly for sanity checking.) 
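+ *
+ * Writes vocab-glove.txt (word/count pairs) and coocur-glove.bin
+ * (int, int, double triples) into the working directory.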
+ */ + +#include "cpptoml.h" +#include "meta/embeddings/coocur_iterator.h" +#include "meta/io/binary.h" +#include "meta/logging/logger.h" +#include "meta/util/progress.h" + +using namespace meta; + +int main(int argc, char** argv) +{ + using namespace embeddings; + + if (argc < 2) + { + std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + + // extract building parameters + auto embed_cfg = config->get_table("embeddings"); + auto prefix = *embed_cfg->get_as("prefix"); + + { + std::ifstream input{prefix + "/vocab.bin", std::ios::binary}; + std::ofstream output{"vocab-glove.txt"}; + auto size = io::packed::read(input); + + printing::progress progress{" > Decompressing vocab: ", + size}; + for (uint64_t tid = 0; tid < size; ++tid) + { + progress(tid); + auto word = io::packed::read(input); + auto count = io::packed::read(input); + + output << word << " " << count << "\n"; + } + } + + { + coocur_iterator iter{prefix + "/coocur.bin"}; + printing::progress progress{" > Decompressing coocurrence matrix: ", + iter.total_bytes()}; + std::ofstream output{"coocur-glove.bin", std::ios::binary}; + for (; iter != coocur_iterator{}; ++iter) + { + progress(iter.bytes_read()); + auto record = *iter; + io::write_binary(output, (int)record.target); + io::write_binary(output, (int)record.context); + io::write_binary(output, record.weight); + } + } +} From faa602fa91f925fa26bb1d299e5399fe85f5c8bd Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 13:40:33 -0600 Subject: [PATCH 15/48] Save normalized word vectors after training. --- src/embeddings/tools/glove.cpp | 86 ++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/embeddings/tools/glove.cpp b/src/embeddings/tools/glove.cpp index ff0840e67..2c95cd851 100644 --- a/src/embeddings/tools/glove.cpp +++ b/src/embeddings/tools/glove.cpp @@ -214,6 +214,9 @@ class glove_trainer xmax_ = embed_cfg.get_as("xmax").value_or(100.0); scale_ = embed_cfg.get_as("scale").value_or(0.75); + auto num_rare = static_cast( + embed_cfg.get_as("unk-num-avg").value_or(100)); + if (!filesystem::file_exists(prefix + "/vocab.bin")) { LOG(fatal) << "Vocabulary has not yet been generated, please " @@ -263,6 +266,14 @@ class glove_trainer // train using the specified number of threads train(prefix, num_threads, iters, total_records); + + // delete the temporary shuffled coocurrence files + for (std::size_t i = 0; i < num_threads; ++i) + filesystem::delete_file(prefix + "/coocur-shuf." 
+ std::to_string(i) + + ".bin"); + + // save the target and context word embeddings + save(prefix, num_words, num_rare); } array_view target_vector(uint64_t term) @@ -447,6 +458,81 @@ class glove_trainer return cost; } + void save(const std::string& prefix, uint64_t num_words, + uint64_t num_rare) const + { + // target embeddings + { + std::ofstream output{prefix + "/embeddings.target.bin", + std::ios::binary}; + printing::progress progress{" > Saving target embeddings: ", + num_words}; + io::packed::write(output, vector_size_); + save_embeddings(output, num_words, num_rare, progress, + [&](uint64_t term) + { + return target_vector(term); + }); + } + + // context embeddings + { + + std::ofstream output{prefix + "/embeddings.context.bin", + std::ios::binary}; + printing::progress progress{" > Saving context embeddings: ", + num_words}; + io::packed::write(output, vector_size_); + save_embeddings(output, num_words, num_rare, progress, + [&](uint64_t term) + { + return context_vector(term); + }); + } + } + + template + void save_embeddings(std::ofstream& output, uint64_t num_words, + uint64_t num_rare, printing::progress& progress, + VectorFetcher&& vf) const + { + for (uint64_t tid = 0; tid < num_words; ++tid) + { + progress(tid); + const auto& vec = vf(tid); + write_normalized(vec.begin(), vec.end(), output); + } + + util::aligned_vector unk_vec(vector_size_, 0.0); + auto num_to_average = std::min(num_rare, num_words); + for (uint64_t tid = num_words - num_rare; tid < num_words; ++tid) + { + const auto& vec = vf(tid); + std::transform(unk_vec.begin(), unk_vec.end(), vec.begin(), + unk_vec.begin(), + [=](double unkweight, double vecweight) + { + return unkweight + vecweight / num_to_average; + }); + } + write_normalized(unk_vec.begin(), unk_vec.end(), output); + } + + template + void write_normalized(ForwardIterator begin, ForwardIterator end, + std::ofstream& output) const + { + auto len = std::sqrt(std::accumulate(begin, end, 0.0, + [](double accum, double weight) + { + return accum + weight * weight; + })); + std::for_each(begin, end, [&](double weight) + { + io::packed::write(output, weight / len); + }); + } + util::aligned_vector weights_; util::aligned_vector grad_squared_; std::size_t vector_size_; From bc82dccc3c420a83f7e85eb218520077632aa8f8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 13:47:19 -0600 Subject: [PATCH 16/48] Split array_view out into the util namespace. --- include/meta/util/array_view.h | 100 +++++++++++++++++++++++++++++++++ src/embeddings/tools/glove.cpp | 57 +++---------------- 2 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 include/meta/util/array_view.h diff --git a/include/meta/util/array_view.h b/include/meta/util/array_view.h new file mode 100644 index 000000000..cb7016650 --- /dev/null +++ b/include/meta/util/array_view.h @@ -0,0 +1,100 @@ +/** + * @file array_view.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_UTIL_ARRAY_VIEW_H_ +#define META_UTIL_ARRAY_VIEW_H_ + +#include + +namespace meta +{ +namespace util +{ + +/** + * A non-owning reference to an array (or part of one). The underlying data + * must outlive the array_view on top of it. + */ +template +class array_view +{ + public: + /** + * Constructs an array view starting at the given starting point of + * the specified length. 
+ * + * @param start The start point + * @param len The length of the array_view + */ + array_view(T* start, std::size_t len) : start_{start}, end_{start + len} + { + // nothing + } + + /** + * Constructs an array view starting at the given starting point and + * ending at the given ending point (exclusive). + * + * @param start The starting point + * @param end The ending point + */ + array_view(T* start, T* end) : start_{start}, end_{end} + { + // nothing + } + + /** + * @return an iterator to the start + */ + T* begin() const + { + return start_; + } + + /** + * @return an iterator to the end + */ + T* end() const + { + return end_; + } + + /** + * @param idx The index to access + * @return the element at that position + */ + const T& operator[](std::size_t idx) const + { + return begin()[idx]; + } + + /** + * @param idx The index to access + * @return the element at that position + */ + T& operator[](std::size_t idx) + { + return begin()[idx]; + } + + /** + * @return the number of elements in this array_view + */ + std::size_t size() const + { + return end_ - start_; + } + + private: + T* start_; + T* end_; +}; +} +} +#endif diff --git a/src/embeddings/tools/glove.cpp b/src/embeddings/tools/glove.cpp index 2c95cd851..c37ea9925 100644 --- a/src/embeddings/tools/glove.cpp +++ b/src/embeddings/tools/glove.cpp @@ -17,6 +17,7 @@ #include "meta/logging/logger.h" #include "meta/parallel/thread_pool.h" #include "meta/util/aligned_allocator.h" +#include "meta/util/array_view.h" #include "meta/util/progress.h" #include "meta/util/printing.h" #include "meta/util/random.h" @@ -139,50 +140,6 @@ std::size_t shuffle_partition(const std::string& prefix, std::size_t max_ram, return total_records; } -template -class array_view -{ - public: - array_view(T* start, std::size_t len) : start_{start}, end_{start + len} - { - // nothing - } - - array_view(T* start, T* end) : start_{start}, end_{end} - { - // nothing - } - - T* begin() const - { - return start_; - } - - T* end() const - { - return end_; - } - - const T& operator[](std::size_t idx) const - { - return begin()[idx]; - } - - T& operator[](std::size_t idx) - { - return begin()[idx]; - } - - std::size_t size() const - { - return end_ - start_; - } - - private: - T* start_; - T* end_; -}; - class glove_exception : public std::runtime_error { public: @@ -276,13 +233,13 @@ class glove_trainer save(prefix, num_words, num_rare); } - array_view target_vector(uint64_t term) + util::array_view target_vector(uint64_t term) { return {weights_.data() + (term * 2 * (vector_size_ + 1)), vector_size_}; } - array_view target_vector(uint64_t term) const + util::array_view target_vector(uint64_t term) const { return {weights_.data() + (term * 2 * (vector_size_ + 1)), vector_size_}; @@ -298,13 +255,13 @@ class glove_trainer return weights_[term * 2 * (vector_size_ + 1) + vector_size_]; } - array_view context_vector(uint64_t term) + util::array_view context_vector(uint64_t term) { return {weights_.data() + (term * 2 + 1) * (vector_size_ + 1), vector_size_}; } - array_view context_vector(uint64_t term) const + util::array_view context_vector(uint64_t term) const { return {weights_.data() + (term * 2 + 1) * (vector_size_ + 1), vector_size_}; @@ -330,7 +287,7 @@ class glove_trainer } private: - array_view target_gradsq(uint64_t term) + util::array_view target_gradsq(uint64_t term) { return {grad_squared_.data() + (term * 2 * (vector_size_ + 1)), vector_size_}; @@ -341,7 +298,7 @@ class glove_trainer return grad_squared_[term * 2 * (vector_size_ + 1) + vector_size_]; } - 
array_view context_gradsq(uint64_t term) + util::array_view context_gradsq(uint64_t term) { return {grad_squared_.data() + (term * 2 + 1) * (vector_size_ + 1), vector_size_}; From 2684fd810b8e6f2c4b4f84425b000f47ccde66ad Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 13:48:55 -0600 Subject: [PATCH 17/48] Catch glove_exceptions in training. --- src/embeddings/tools/glove.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/embeddings/tools/glove.cpp b/src/embeddings/tools/glove.cpp index c37ea9925..16084c2dd 100644 --- a/src/embeddings/tools/glove.cpp +++ b/src/embeddings/tools/glove.cpp @@ -517,6 +517,14 @@ int main(int argc, char** argv) return 1; } - glove_trainer trainer{*embed_cfg}; + try + { + glove_trainer trainer{*embed_cfg}; + } + catch (const glove_exception& ex) + { + LOG(fatal) << ex.what() << ENDLG; + return 1; + } return 0; } From c274236b5fbd987f5b8473906fcbb7a2c8d8004c Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 17:17:57 -0600 Subject: [PATCH 18/48] Move fastapprox to include/meta/math. --- include/meta/{util => math}/fastapprox.h | 0 src/index/ranker/lm_ranker.cpp | 2 +- src/index/ranker/okapi_bm25.cpp | 2 +- src/index/ranker/pivoted_length.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename include/meta/{util => math}/fastapprox.h (100%) diff --git a/include/meta/util/fastapprox.h b/include/meta/math/fastapprox.h similarity index 100% rename from include/meta/util/fastapprox.h rename to include/meta/math/fastapprox.h diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index 8f52bab02..c5460ff29 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -5,7 +5,7 @@ #include #include "meta/corpus/document.h" -#include "meta/util/fastapprox.h" +#include "meta/math/fastapprox.h" #include "meta/index/score_data.h" #include "meta/index/ranker/lm_ranker.h" diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index e464cbc24..6bd61ebda 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -7,7 +7,7 @@ #include "meta/index/inverted_index.h" #include "meta/index/ranker/okapi_bm25.h" #include "meta/index/score_data.h" -#include "meta/util/fastapprox.h" +#include "meta/math/fastapprox.h" namespace meta { diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index b3f3034f3..521630b7a 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -6,7 +6,7 @@ #include "meta/index/inverted_index.h" #include "meta/index/ranker/pivoted_length.h" #include "meta/index/score_data.h" -#include "meta/util/fastapprox.h" +#include "meta/math/fastapprox.h" namespace meta { From 9d5a75b24635fbea301445f81b889e5c568c3df5 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 17:18:26 -0600 Subject: [PATCH 19/48] Add some converting constructors to util::array_view. --- include/meta/util/array_view.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/meta/util/array_view.h b/include/meta/util/array_view.h index cb7016650..7b906035b 100644 --- a/include/meta/util/array_view.h +++ b/include/meta/util/array_view.h @@ -11,6 +11,7 @@ #define META_UTIL_ARRAY_VIEW_H_ #include +#include namespace meta { @@ -49,6 +50,37 @@ class array_view // nothing } + /** + * Constructs an array_view over a std::vector. 
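+     * The view is only valid while the vector's storage is: growing or
+     * destroying the vector invalidates it.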
+ */ + template + array_view(const std::vector& container) + : array_view(container.data(), container.size()) + { + // nothing + } + + /** + * Constructs an array_view over a std::vector. + */ + template + array_view(std::vector& container) + : array_view(container.data(), container.size()) + { + // nothing + } + + /** + * Constructs an array_view from a compatible other array_view. + */ + template ::value>::type> + array_view(const array_view& av) + : start_{av.begin()}, end_{av.end()} + { + // nothing + } + /** * @return an iterator to the start */ From 9da825ad9460b4236a48f58724a58af6568cf65a Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Tue, 9 Feb 2016 17:18:50 -0600 Subject: [PATCH 20/48] Add simple vector math library. Right now, only overloads for operator+ and operator- are provided. To use them, one needs to add a using namespace meta::math::operators; in the scope where their use is desired. --- include/meta/math/vector.h | 283 +++++++++++++++++++++++++++++++++++++ tests/unit_tests.cmake | 4 + tests/vector_math_test.cpp | 162 +++++++++++++++++++++ 3 files changed, 449 insertions(+) create mode 100644 include/meta/math/vector.h create mode 100644 tests/vector_math_test.cpp diff --git a/include/meta/math/vector.h b/include/meta/math/vector.h new file mode 100644 index 000000000..7830cafcd --- /dev/null +++ b/include/meta/math/vector.h @@ -0,0 +1,283 @@ +/** + * @file vector.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_MATH_VECTOR_H_ +#define META_MATH_VECTOR_H_ + +#include +#include +#include +#include + +#include "meta/util/array_view.h" + +namespace meta +{ +namespace math +{ +namespace operators +{ + +template > +using vector = std::vector; + +namespace detail +{ +template +struct common_type +{ + using type = + typename std::remove_const::type>::type; + using vector_type = vector; +}; +} + +// forward declarations: operator+ + +// - vector x vector +template +vector operator+(const vector& a, + const vector& b); + +template +vector operator+(vector&& a, + const vector& b); + +template +vector operator+(const vector& a, + vector&& b); + +template +vector operator+(vector&& a, + vector&& b); + +// - vector x array_view +template +vector operator+(const vector& a, + util::array_view b); + +template +vector operator+(vector&& a, util::array_view b); + +// - array_view x vector +template +vector operator+(util::array_view a, + const vector& b); + +template +vector operator+(util::array_view a, vector&& b); + +// - array_view x array_view +template +typename detail::common_type::vector_type +operator+(util::array_view a, util::array_view b); + +// implementations: operator+ + +// - vector x vector +template +vector operator+(const vector& a, + const vector& b) +{ + vector result{a}; + return std::move(result) + util::array_view(b); +} + +template +vector operator+(vector&& a, + const vector& b) +{ + return std::move(a) + util::array_view(b); +} + +template +vector operator+(const vector& a, + vector&& b) +{ + return std::move(b) + util::array_view(a); +} + +template +vector operator+(vector&& a, + vector&& b) +{ + return std::move(a) + util::array_view(b); +} + +// - vector x array_view +template +vector operator+(const vector& a, + util::array_view b) +{ + vector result{a}; + return std::move(result) + b; +} + +template +vector operator+(vector&& a, util::array_view b) +{ + vector result{std::move(a)}; 
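+    // reuse the moved-in storage as the destination: result[i] += b[i]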
+ std::transform(b.begin(), b.end(), result.begin(), result.begin(), + [](const T& bval, const T& resval) + { + return bval + resval; + }); + return result; +} + +// - array_view x vector +template +vector operator+(util::array_view a, + const vector& b) +{ + vector result{b}; + return std::move(result) + a; +} + +template +vector operator+(util::array_view a, vector&& b) +{ + return std::move(b) + a; +} + +// - array_view x array_view +template +typename detail::common_type::vector_type operator+(util::array_view a, + util::array_view b) +{ + typename detail::common_type::vector_type result{a.begin(), a.end()}; + return std::move(result) + b; +} + +// forward declarations: operator- + +// - vector x vector +template +vector operator-(const vector& a, + const vector& b); + +template +vector operator-(vector&& a, + const vector& b); + +template +vector operator-(const vector& a, + vector&& b); + +template +vector operator-(vector&& a, + vector&& b); + +// - vector x array_view +template +vector operator-(const vector& a, + util::array_view b); + +template +vector operator-(vector&& a, util::array_view b); + +// - array_view x vector +template +vector operator-(util::array_view a, + const vector& b); + +template +vector operator-(util::array_view a, vector&& b); + +// - array_view x array_view +template +typename detail::common_type::vector_type +operator-(util::array_view a, util::array_view b); + +// implementations: operator- + +// vector x vector +template +vector operator-(const vector& a, + const vector& b) +{ + vector result{a}; + return std::move(result) - util::array_view(b); +} + +template +vector operator-(vector&& a, + const vector& b) +{ + return std::move(a) - util::array_view(b); +} + +template +vector operator-(const vector& a, + vector&& b) +{ + return util::array_view(a) - std::move(b); +} + +template +vector operator-(vector&& a, + vector&& b) +{ + return std::move(a) - util::array_view(b); +} + +// vector x array_view + +template +vector operator-(const vector& a, + util::array_view b) +{ + vector result{std::move(a)}; + return std::move(result) - b; +} + +template +vector operator-(vector&& a, util::array_view b) +{ + vector result{std::move(a)}; + std::transform(result.begin(), result.end(), b.begin(), result.begin(), + [](const T& resval, const T& bval) + { + return resval - bval; + }); + return result; +} + +// array_view x vector +template +vector operator-(util::array_view a, + const vector& b) +{ + vector result{b}; + return a - std::move(result); +} + +template +vector operator-(util::array_view a, vector&& b) +{ + vector result{std::move(b)}; + std::transform(a.begin(), a.end(), result.begin(), result.begin(), + [](const T& aval, const T& resval) + { + return aval - resval; + }); + return result; +} + +template +typename detail::common_type::vector_type operator-(util::array_view a, + util::array_view b) +{ + typename detail::common_type::vector_type result{a.begin(), a.end()}; + return std::move(result) - b; +} +} +} +} +#endif diff --git a/tests/unit_tests.cmake b/tests/unit_tests.cmake index 366199354..e4019b93a 100644 --- a/tests/unit_tests.cmake +++ b/tests/unit_tests.cmake @@ -85,3 +85,7 @@ set_tests_properties(topics PROPERTIES TIMEOUT 10 WORKING_DIRECTORY add_test(hashing ${UNIT_TEST_EXE} --only=[hashing] --reporter=spec) set_tests_properties(hashing PROPERTIES TIMEOUT 10 WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + +add_test(vector-math ${UNIT_TEST_EXE} --only=[vector math] --reporter=spec) +set_tests_properties(vector-math PROPERTIES TIMEOUT 
10 WORKING_DIRECTORY + ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/tests/vector_math_test.cpp b/tests/vector_math_test.cpp new file mode 100644 index 000000000..2461c9caf --- /dev/null +++ b/tests/vector_math_test.cpp @@ -0,0 +1,162 @@ +/** + * @file vector_math_test.h + * @author Chase Geigle + */ + +#include "bandit/bandit.h" +#include "meta/math/vector.h" + +using namespace bandit; + +go_bandit([]() { + describe("[vector math]", []() { + using namespace meta; + using namespace math::operators; + std::vector a = {2, 2, 2, 2}; + std::vector b = {1, 1, 1, 1}; + + it("should add two vectors (const ref + const ref)", [&]() { + auto c = a + b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (rval + const ref)", [&]() { + auto c = std::vector(a) + b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (const ref + rval)", [&]() { + auto c = a + std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (rval + rval)", [&]() { + auto c = std::vector(a) + std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (const ref + array_view)", [&]() { + auto c = a + util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (rval + array_view)", [&]() { + auto c = std::vector(a) + util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (array_view + const ref)", [&]() { + auto c = util::array_view(a) + b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (array_view + rval)", [&]() { + auto c = util::array_view(a) + std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (array_view + array_view)", [&]() { + auto c = util::array_view(a) + + util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should add two vectors (array_view + array_view)", [&]() { + auto c = util::array_view(a) + + util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); + + it("should subtract two vectors (const ref - const ref)", [&]() { + auto c = a - b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (rval - const ref)", [&]() { + auto c = std::vector(a) - b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (const ref - rval)", [&]() { + auto c = a - std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (rval - rval)", [&]() { + auto c = std::vector(a) - std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (const ref - array_view)", [&]() { + auto c = a - util::array_view(b); + AssertThat(c.size(), 
Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (rval - array_view)", [&]() { + auto c = std::vector(a) - util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (array_view - const ref)", [&]() { + auto c = util::array_view(a) - b; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (array_view - rval)", [&]() { + auto c = util::array_view(a) - std::vector(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should subtract two vectors (array_view - array_view)", [&]() { + auto c = util::array_view(a) + - util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should add two vectors (array_view - array_view)", [&]() { + auto c = util::array_view(a) + - util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + }); +}); From 79b2a763bb8ef67bc3318a316aa6483744ae1fb4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 15:58:41 -0600 Subject: [PATCH 21/48] Fix incorrect definition of l1norm() in sgd_model. --- src/learn/sgd.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/learn/sgd.cpp b/src/learn/sgd.cpp index bdc7af6e3..71968af4d 100644 --- a/src/learn/sgd.cpp +++ b/src/learn/sgd.cpp @@ -3,6 +3,8 @@ * @author Chase Geigle */ +#include +#include #include "meta/io/packed.h" #include "meta/learn/sgd.h" @@ -177,11 +179,11 @@ double sgd_model::l2norm() const double sgd_model::l1norm() const { - return std::count_if(weights_.begin(), weights_.end(), - [&](const weight_type& w) - { - return scale_ * w.weight > 0; - }); + return std::accumulate(weights_.begin(), weights_.end(), 0.0, + [](double accum, const weight_type& w) + { + return accum + std::abs(w.weight); + }); } } } From 75ddbb919c0dad428dcaa5baf64f775ff37a9267 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 17:06:08 -0600 Subject: [PATCH 22/48] Add initial interface for loading word embeddings. --- include/meta/embeddings/word_embeddings.h | 104 +++++++++++++ src/embeddings/CMakeLists.txt | 3 + src/embeddings/word_embeddings.cpp | 179 ++++++++++++++++++++++ 3 files changed, 286 insertions(+) create mode 100644 include/meta/embeddings/word_embeddings.h create mode 100644 src/embeddings/word_embeddings.cpp diff --git a/include/meta/embeddings/word_embeddings.h b/include/meta/embeddings/word_embeddings.h new file mode 100644 index 000000000..396de5024 --- /dev/null +++ b/include/meta/embeddings/word_embeddings.h @@ -0,0 +1,104 @@ +/** + * @file word_embeddings.h + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_EMBEDDINGS_WORD_EMBEDDINGS_H_ +#define META_EMBEDDINGS_WORD_EMBEDDINGS_H_ + +#include +#include + +#include "cpptoml.h" +#include "meta/hashing/probe_map.h" +#include "meta/util/aligned_allocator.h" +#include "meta/util/array_view.h" +#include "meta/util/string_view.h" + +namespace meta +{ +namespace embeddings +{ + +struct embedding +{ + std::size_t tid; + util::array_view v; +}; + +/** + * A read-only model for accessing word embeddings. 
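+ *
+ * Terms that are not in the vocabulary are mapped to a single shared
+ * "unknown word" vector, stored after all in-vocabulary vectors.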
+ */ +class word_embeddings +{ + public: + /** + * Loads word embeddings from files. + * + * @param vocab The stream to read the vocabulary from + * @param vectors The stream to read the vectors from + */ + word_embeddings(std::istream& vocab, std::istream& vectors); + + /** + * Loads word embeddings from parallel streams. The word embeddings + * will be component-wise sums of the word embeddings from the two + * streams. + * + * @param vocab The stream to read the vocabulary from + * @param first The first stream + * @param second The second stream + */ + word_embeddings(std::istream& vocab, std::istream& first, + std::istream& second); + + /** + * @param term The term to look up + * @return the embedding vector (as an array_view) for the given term, + * or the vector for the unknown word as appropriate + */ + embedding at(util::string_view term) const; + + /** + * @param tid The term id to look up + * @return the term (as a string_view) represented by that term id + */ + util::string_view term(std::size_t tid) const; + + private: + util::array_view vector(std::size_t tid); + + util::array_view vector(std::size_t tid) const; + + void load_vocab(std::istream& vocab); + + /// The size of the word embeddings + const std::size_t vector_size_; + + /// The embeddings matrix + util::aligned_vector embeddings_; + + /// A list of all of the strings in the vocabulary, indexed by id + util::aligned_vector id_to_term_; + + /// A hash table from a term to its id + hashing::probe_map term_to_id_; +}; + +/** + * Exception thrown when interacting with the word_embeddings model. + */ +class word_embeddings_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + +word_embeddings load_embeddings(const cpptoml::table& config); +} +} +#endif diff --git a/src/embeddings/CMakeLists.txt b/src/embeddings/CMakeLists.txt index c61f3800f..ec9d0972b 100644 --- a/src/embeddings/CMakeLists.txt +++ b/src/embeddings/CMakeLists.txt @@ -1,3 +1,6 @@ project(meta-embeddings) add_subdirectory(tools) + +add_library(meta-embeddings word_embeddings.cpp) +target_link_libraries(meta-embeddings cpptoml meta-util) diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp new file mode 100644 index 000000000..8a82fe577 --- /dev/null +++ b/src/embeddings/word_embeddings.cpp @@ -0,0 +1,179 @@ +/** + * @file word_embeddings.cpp + * @author Chase Geigle + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
+ */ + +#include "meta/embeddings/word_embeddings.h" +#include "meta/io/packed.h" +#include "meta/util/progress.h" + +namespace meta +{ +namespace embeddings +{ + +using vocab_type = hashing::probe_map; + +word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) + : vector_size_{io::packed::read(vectors)}, + embeddings_(vector_size_), + id_to_term_(io::packed::read(vocab)), + term_to_id_{static_cast(std::ceil( + id_to_term_.size() / vocab_type::default_max_load_factor()))} +{ + load_vocab(vocab); + + printing::progress progress{" > Loading embeddings: ", id_to_term_.size()}; + // +1 for the unk vector, which is always last + for (std::size_t tid = 0; tid < id_to_term_.size() + 1; ++tid) + { + if (!vectors) + throw word_embeddings_exception{ + "embeddings stream ended unexpectedly"}; + + progress(tid); + auto vec = vector(tid); + std::generate(vec.begin(), vec.end(), [&]() + { + return io::packed::read(vectors); + }); + } +} + +word_embeddings::word_embeddings(std::istream& vocab, std::istream& first, + std::istream& second) + : vector_size_{io::packed::read(first)}, + embeddings_(vector_size_), + id_to_term_(io::packed::read(vocab)), + term_to_id_{static_cast(std::ceil( + id_to_term_.size() / vocab_type::default_max_load_factor()))} +{ + if (io::packed::read(second) != vector_size_) + throw word_embeddings_exception{"mismatched vector sizes"}; + + load_vocab(vocab); + + printing::progress progress{" > Loading embeddings: ", id_to_term_.size()}; + // +1 for the unk vector, which is always last + for (std::size_t tid = 0; tid < id_to_term_.size() + 1; ++tid) + { + if (!first) + throw word_embeddings_exception{ + "first embeddings stream ended unexpectedly"}; + + if (!second) + throw word_embeddings_exception{ + "second embeddings stream ended unexpectedly"}; + + progress(tid); + auto vec = vector(tid); + std::generate(vec.begin(), vec.end(), [&]() + { + return (io::packed::read(first) + + io::packed::read(second)) + / 2; + }); + } +} + +void word_embeddings::load_vocab(std::istream& vocab) +{ + printing::progress progress{" > Loading vocab: ", id_to_term_.size()}; + for (std::size_t tid = 0; tid < id_to_term_.size(); ++tid) + { + if (!vocab) + throw word_embeddings_exception{"vocab stream ended unexpectedly"}; + + progress(tid); + io::packed::read(vocab, id_to_term_[tid]); + term_to_id_[id_to_term_[tid]] = tid; + } +} + +util::array_view word_embeddings::vector(std::size_t tid) +{ + return {embeddings_.data() + tid * vector_size_, vector_size_}; +} + +util::array_view word_embeddings::vector(std::size_t tid) const +{ + return {embeddings_.data() + tid * vector_size_, vector_size_}; +} + +embedding word_embeddings::at(util::string_view term) const +{ + std::size_t tid; + auto v_it = term_to_id_.find(term); + if (v_it == term_to_id_.end()) + { + tid = id_to_term_.size(); + } + else + { + tid = v_it->value(); + } + return {tid, vector(tid)}; +} + +util::string_view word_embeddings::term(std::size_t tid) const +{ + if (tid >= id_to_term_.size()) + return ""; + return id_to_term_[tid]; +} + +word_embeddings load_embeddings(const cpptoml::table& config) +{ + auto prefix = config.get_as("prefix"); + if (!prefix) + throw word_embeddings_exception{ + "missing prefix key in configuration file"}; + + std::ifstream vocab{*prefix + "/vocab.bin", std::ios::binary}; + if (!vocab) + throw word_embeddings_exception{"missing vocabulary file in: " + + *prefix}; + + std::ifstream target{*prefix + "/embeddings.target.bin", std::ios::binary}; + std::ifstream context{*prefix + 
"/embeddings.context.bin", + std::ios::binary}; + + auto mode = config.get_as("mode").value_or("average"); + if (mode == "average") + { + if (!target) + throw word_embeddings_exception{"missing target vectors in: " + + *prefix}; + if (!context) + throw word_embeddings_exception{"missing context vectors in: " + + *prefix}; + + return {vocab, target, context}; + } + else if (mode == "target") + { + if (!target) + throw word_embeddings_exception{"missing target vectors in: " + + *prefix}; + + return {vocab, target}; + } + else if (mode == "context") + { + if (!context) + throw word_embeddings_exception{"missing context vectors in: " + + *prefix}; + + return {vocab, context}; + } + else + { + throw word_embeddings_exception{"invalid mode key in configuration"}; + } +} +} +} From 80dedc870d217d28b3322b93d7f7d21582282761 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 17:30:18 -0600 Subject: [PATCH 23/48] Add top_k method to word_embeddings. --- include/meta/embeddings/word_embeddings.h | 15 ++++++++++++++ src/embeddings/word_embeddings.cpp | 25 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/meta/embeddings/word_embeddings.h b/include/meta/embeddings/word_embeddings.h index 396de5024..58d7e8c79 100644 --- a/include/meta/embeddings/word_embeddings.h +++ b/include/meta/embeddings/word_embeddings.h @@ -30,6 +30,12 @@ struct embedding util::array_view v; }; +struct scored_embedding +{ + embedding e; + double score; +}; + /** * A read-only model for accessing word embeddings. */ @@ -69,6 +75,15 @@ class word_embeddings */ util::string_view term(std::size_t tid) const; + /** + * @param query A vector of the same length as a word embedding to + * query for + * @param k The number of embeddings to return + * @return the top k word scored_embeddings closest to the query + */ + std::vector top_k(util::array_view query, + std::size_t k = 100) const; + private: util::array_view vector(std::size_t tid); diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index 8a82fe577..57ebecf91 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -9,6 +9,7 @@ #include "meta/embeddings/word_embeddings.h" #include "meta/io/packed.h" +#include "meta/util/fixed_heap.h" #include "meta/util/progress.h" namespace meta @@ -126,6 +127,30 @@ util::string_view word_embeddings::term(std::size_t tid) const return id_to_term_[tid]; } +std::vector +word_embeddings::top_k(util::array_view query, + std::size_t k) const +{ + auto comp = [](const scored_embedding& a, const scored_embedding& b) + { + return a.score > b.score; + }; + util::fixed_heap results{k, comp}; + + // +1 for + for (std::size_t tid = 0; tid < id_to_term_.size() + 1; ++tid) + { + auto vec = vector(tid); + auto score + = std::inner_product(query.begin(), query.end(), vec.begin(), 0.0); + + embedding e{tid, vec}; + results.push({e, score}); + } + + return results.extract_top(); +} + word_embeddings load_embeddings(const cpptoml::table& config) { auto prefix = config.get_as("prefix"); From a7fa8281ca37b0f21433560be36a501fde7ca000 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 17:30:43 -0600 Subject: [PATCH 24/48] Add initial version of demo app for playing with word embeddings. 
--- src/embeddings/tools/CMakeLists.txt | 3 + .../tools/interactive_embeddings.cpp | 60 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 src/embeddings/tools/interactive_embeddings.cpp diff --git a/src/embeddings/tools/CMakeLists.txt b/src/embeddings/tools/CMakeLists.txt index 11a136084..3eda80486 100644 --- a/src/embeddings/tools/CMakeLists.txt +++ b/src/embeddings/tools/CMakeLists.txt @@ -10,6 +10,9 @@ target_link_libraries(glove meta-util cpptoml ${CMAKE_THREAD_LIBS_INIT}) +add_executable(interactive-embeddings interactive_embeddings.cpp) +target_link_libraries(interactive-embeddings meta-embeddings) + add_executable(meta-to-glove meta_to_glove.cpp) target_link_libraries(meta-to-glove meta-util meta-io diff --git a/src/embeddings/tools/interactive_embeddings.cpp b/src/embeddings/tools/interactive_embeddings.cpp new file mode 100644 index 000000000..14ff5a59f --- /dev/null +++ b/src/embeddings/tools/interactive_embeddings.cpp @@ -0,0 +1,60 @@ +/** + * @file interactive_embeddings.cpp + * @author Chase Geigle + * + * This tool is an interactive demo over learned word embeddings. Each + * query will be interpreted down to a unit-length vector and the top 100 + * closest embeddings to that query will be printed along with their score. + */ + +#include +#include + +#include "cpptoml.h" +#include "meta/embeddings/word_embeddings.h" +#include "meta/logging/logger.h" + +using namespace meta; + +int main(int argc, char** argv) +{ + if (argc < 2) + { + std::cerr << "Usage: " << argv[0] << " config.toml" << std::endl; + return 1; + } + + logging::set_cerr_logging(); + + auto config = cpptoml::parse_file(argv[1]); + auto embed_cfg = config->get_table("embeddings"); + if (!embed_cfg) + { + std::cerr << "Missing [embeddings] configuration in " << argv[1] + << std::endl; + return 1; + } + + auto glove = embeddings::load_embeddings(*embed_cfg); + + std::cout << "Enter a query and press enter.\n> " << std::flush; + + std::string line; + while (std::getline(std::cin, line)) + { + if (line.empty()) + break; + + auto query = glove.at(line); + for (const auto& se : glove.top_k(query.v)) + { + auto term = glove.term(se.e.tid); + std::cout << term << " (" << se.score << ")\n"; + } + std::cout << std::endl; + + std::cout << "> " << std::flush; + } + + return 0; +} From 296f6ccbdd891422e540888e3d86e5a1a0009125 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 18:25:33 -0600 Subject: [PATCH 25/48] Fix loading bugs for word embeddings. 
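The main culprit is member-initialization order: C++ initializes data members
in declaration order, not in the order they appear in the constructor's
initializer list. embeddings_ is therefore now declared after id_to_term_, so
its size expression can use the vocabulary count that was just read, and it is
sized vector_size_ * (vocab size + 1) to leave room for the trailing unk
vector; load_vocab() also has to skip the count stored after each word. A
minimal sketch of the ordering rule (hypothetical names):

```cpp
#include <cstddef>
#include <vector>

struct before_fix
{
    std::vector<double> buf_; // declared first, so initialized first --
    std::size_t n_;           // n_ is still indeterminate at that point
    // undefined behavior: buf_(n_ * 4) reads n_ before it is initialized
    before_fix(std::size_t n) : n_{n}, buf_(n_ * 4) {}
};

struct after_fix
{
    std::size_t n_;           // initialized before buf_
    std::vector<double> buf_;
    after_fix(std::size_t n) : n_{n}, buf_(n_ * 4) {} // well-defined
};
```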
--- include/meta/embeddings/word_embeddings.h | 6 +++--- src/embeddings/word_embeddings.cpp | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/meta/embeddings/word_embeddings.h b/include/meta/embeddings/word_embeddings.h index 58d7e8c79..22c00b312 100644 --- a/include/meta/embeddings/word_embeddings.h +++ b/include/meta/embeddings/word_embeddings.h @@ -94,14 +94,14 @@ class word_embeddings /// The size of the word embeddings const std::size_t vector_size_; - /// The embeddings matrix - util::aligned_vector embeddings_; - /// A list of all of the strings in the vocabulary, indexed by id util::aligned_vector id_to_term_; /// A hash table from a term to its id hashing::probe_map term_to_id_; + + /// The embeddings matrix + util::aligned_vector embeddings_; }; /** diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index 57ebecf91..0d2aa9abd 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -21,10 +21,10 @@ using vocab_type = hashing::probe_map; word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) : vector_size_{io::packed::read(vectors)}, - embeddings_(vector_size_), id_to_term_(io::packed::read(vocab)), term_to_id_{static_cast(std::ceil( - id_to_term_.size() / vocab_type::default_max_load_factor()))} + id_to_term_.size() / vocab_type::default_max_load_factor()))}, + embeddings_(vector_size_ * (id_to_term_.size() + 1)) { load_vocab(vocab); @@ -48,10 +48,10 @@ word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) word_embeddings::word_embeddings(std::istream& vocab, std::istream& first, std::istream& second) : vector_size_{io::packed::read(first)}, - embeddings_(vector_size_), id_to_term_(io::packed::read(vocab)), term_to_id_{static_cast(std::ceil( - id_to_term_.size() / vocab_type::default_max_load_factor()))} + id_to_term_.size() / vocab_type::default_max_load_factor()))}, + embeddings_(vector_size_ * (id_to_term_.size() + 1)) { if (io::packed::read(second) != vector_size_) throw word_embeddings_exception{"mismatched vector sizes"}; @@ -92,6 +92,9 @@ void word_embeddings::load_vocab(std::istream& vocab) progress(tid); io::packed::read(vocab, id_to_term_[tid]); term_to_id_[id_to_term_[tid]] = tid; + + // discard the count + io::packed::read(vocab); } } From adefb2d87cf06fb25cd23cf9b6af6ba2666399f4 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 18:26:04 -0600 Subject: [PATCH 26/48] Fix normalization when averaging target and context vectors. 
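Dividing the summed target and context vectors by 2 does not generally
produce a unit-length vector, but top_k() scores candidates with a plain
inner product, which only acts like cosine similarity when every stored
embedding has unit l2 length. The sum is therefore rescaled by its Euclidean
norm instead. In terms of the new math::vector helpers, the intent is roughly
(a sketch, not the exact code):

```cpp
#include <utility>
#include <vector>

#include "meta/math/vector.h"

// combine two embedding vectors into a single unit-length vector
std::vector<double> unit_average(const std::vector<double>& target,
                                 const std::vector<double>& context)
{
    using namespace meta::math::operators;
    auto sum = target + context; // component-wise sum
    auto len = l2norm(sum);      // Euclidean length of the sum
    return std::move(sum) / len; // rescale to unit length
}
```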
--- include/meta/math/vector.h | 118 +++++++++++++++++++++++++++++ src/embeddings/word_embeddings.cpp | 9 ++- 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/include/meta/math/vector.h b/include/meta/math/vector.h index 7830cafcd..cfeb996b6 100644 --- a/include/meta/math/vector.h +++ b/include/meta/math/vector.h @@ -11,8 +11,10 @@ #define META_MATH_VECTOR_H_ #include +#include #include #include +#include #include #include "meta/util/array_view.h" @@ -277,6 +279,122 @@ typename detail::common_type::vector_type operator-(util::array_view a, typename detail::common_type::vector_type result{a.begin(), a.end()}; return std::move(result) - b; } + +// operator/ +template +vector operator/(vector&& vec, U denom) +{ + vector result{std::move(vec)}; + std::transform(result.begin(), result.end(), result.begin(), + [=](const T& elem) + { + return elem / denom; + }); + return result; +} + +template +vector operator/(const vector& vec, U denom) +{ + vector result{vec}; + return std::move(result) / denom; +} + +template +vector::type> operator/(util::array_view vec, + U denom) +{ + vector::type> result{vec.begin(), vec.end()}; + return std::move(result) / denom; +} + +// operator* +template +vector operator*(vector&& vec, U mult) +{ + vector result{std::move(vec)}; + std::transform(result.begin(), result.end(), result.begin(), + [=](const T& elem) + { + return elem * mult; + }); + return result; +} + +template +vector operator*(const vector& vec, U mult) +{ + vector result{vec}; + return std::move(result) * mult; +} + +template +vector::type> operator*(util::array_view vec, + U mult) +{ + vector::type> result{vec.begin(), vec.end()}; + return std::move(result) * mult; +} + +template +vector operator*(U mult, vector&& vec) +{ + vector result{std::move(vec)}; + std::transform(result.begin(), result.end(), result.begin(), + [=](const T& elem) + { + return elem * mult; + }); + return result; +} + +template +vector operator*(U mult, const vector& vec) +{ + vector result{vec}; + return std::move(result) * mult; +} + +template +vector::type> operator*(U mult, + util::array_view vec) +{ + vector::type> result{vec.begin(), vec.end()}; + return std::move(result) * mult; +} + +// norms +template +double l2norm(util::array_view vec) +{ + return std::sqrt(std::accumulate(vec.begin(), vec.end(), 0.0, + [](double accum, const T& elem) + { + return accum + elem * elem; + })); +} + +template +double l2norm(const vector& vec) +{ + return l2norm(util::array_view(vec)); +} + +template +double l1norm(util::array_view vec) +{ + return std::accumulate(vec.begin(), vec.end(), 0.0, + [](double accum, const T& elem) + { + return accum + std::abs(elem); + }); +} + +template +double l1norm(const vector& vec) +{ + return l1norm(util::array_view(vec)); +} } } } diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index 0d2aa9abd..23b08f62f 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -9,6 +9,7 @@ #include "meta/embeddings/word_embeddings.h" #include "meta/io/packed.h" +#include "meta/math/vector.h" #include "meta/util/fixed_heap.h" #include "meta/util/progress.h" @@ -75,9 +76,13 @@ word_embeddings::word_embeddings(std::istream& vocab, std::istream& first, std::generate(vec.begin(), vec.end(), [&]() { return (io::packed::read(first) - + io::packed::read(second)) - / 2; + + io::packed::read(second)); }); + auto len = math::operators::l2norm(vec); + std::transform(vec.begin(), vec.end(), vec.begin(), [=](double weight) + { + return weight / 
len; + }); } } From 35bc18507ab58456066f3b2fc04c9dc6e6fd624b Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 18:27:14 -0600 Subject: [PATCH 27/48] Only report top 10 similar words in interactive-embeddings. --- src/embeddings/tools/interactive_embeddings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/tools/interactive_embeddings.cpp b/src/embeddings/tools/interactive_embeddings.cpp index 14ff5a59f..c1ce2c3e0 100644 --- a/src/embeddings/tools/interactive_embeddings.cpp +++ b/src/embeddings/tools/interactive_embeddings.cpp @@ -46,7 +46,7 @@ int main(int argc, char** argv) break; auto query = glove.at(line); - for (const auto& se : glove.top_k(query.v)) + for (const auto& se : glove.top_k(query.v, 10)) { auto term = glove.term(se.e.tid); std::cout << term << " (" << se.score << ")\n"; From 1fc97f37c3f9ac41c1a2a0b07746b8df2207edb8 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Wed, 10 Feb 2016 20:14:38 -0600 Subject: [PATCH 28/48] Add simple math expression parsing to interactive embeddings demo. --- .../tools/interactive_embeddings.cpp | 95 +++++++++++++++++-- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/src/embeddings/tools/interactive_embeddings.cpp b/src/embeddings/tools/interactive_embeddings.cpp index c1ce2c3e0..d5c721287 100644 --- a/src/embeddings/tools/interactive_embeddings.cpp +++ b/src/embeddings/tools/interactive_embeddings.cpp @@ -7,15 +7,90 @@ * closest embeddings to that query will be printed along with their score. */ +#include #include #include +#include #include "cpptoml.h" #include "meta/embeddings/word_embeddings.h" #include "meta/logging/logger.h" +#include "meta/math/vector.h" using namespace meta; +class parse_exception : public std::runtime_error +{ + public: + using std::runtime_error::runtime_error; +}; + +void parse_whitespace(util::string_view& query) +{ + while (!query.empty() && std::isspace(query.front())) + query = query.substr(1); +} + +util::array_view +parse_word(util::string_view& query, const embeddings::word_embeddings& glove) +{ + parse_whitespace(query); + auto word = query; + while (!query.empty() && (!std::isspace(query.front()) + && query.front() != '+' && query.front() != '-')) + { + query = query.substr(1); + } + word = word.substr(0, word.length() - query.length()); + if (word.empty()) + throw parse_exception{"invalid expression"}; + parse_whitespace(query); + return glove.at(word).v; +} + +std::vector parse_expression(util::string_view& query, + const embeddings::word_embeddings& glove) +{ + if (query.empty()) + throw parse_exception{"invalid expression"}; + + using namespace meta::math::operators; + auto pos = query.find_last_of("+-"); + if (pos == util::string_view::npos) + { + auto vec = parse_word(query, glove); + return {vec.begin(), vec.end()}; + } + + auto left_expr = query.substr(0, pos); + auto left = parse_expression(left_expr, glove); + query = query.substr(pos); + if (query.empty()) + throw parse_exception{"invalid expression"}; + auto op = query.front(); + query = query.substr(1); + auto right = parse_word(query, glove); + + switch (op) + { + case '+': + return left + right; + case '-': + return left - right; + default: + throw parse_exception{"invalid expression"}; + } +} + +std::vector parse_query(util::string_view query, + const embeddings::word_embeddings& glove) +{ + using namespace meta::math::operators; + auto vec = parse_expression(query, glove); + auto len = l2norm(vec); + return std::move(vec) / len; +} + int main(int argc, char** argv) { if (argc 
< 2) @@ -37,7 +112,8 @@ int main(int argc, char** argv) auto glove = embeddings::load_embeddings(*embed_cfg); - std::cout << "Enter a query and press enter.\n> " << std::flush; + std::cout << "Enter a query and press enter (empty to quit).\n> " + << std::flush; std::string line; while (std::getline(std::cin, line)) @@ -45,13 +121,20 @@ int main(int argc, char** argv) if (line.empty()) break; - auto query = glove.at(line); - for (const auto& se : glove.top_k(query.v, 10)) + try + { + auto query = parse_query(line, glove); + for (const auto& se : glove.top_k(query, 10)) + { + auto term = glove.term(se.e.tid); + std::cout << term << " (" << se.score << ")\n"; + } + std::cout << std::endl; + } + catch (const parse_exception& ex) { - auto term = glove.term(se.e.tid); - std::cout << term << " (" << se.score << ")\n"; + std::cout << "error: " << ex.what() << std::endl; } - std::cout << std::endl; std::cout << "> " << std::flush; } From b0278f7a7bc16233142ac1ae34c59f71745738d6 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 11:16:46 -0600 Subject: [PATCH 29/48] Read filter chain definition from embeddings configuration group. This cleans stuff up a bit and doesn't require you to modify your [[analyzers]] setup, since no analyzer is being used here. --- src/embeddings/tools/embedding_coocur.cpp | 22 +---------------- src/embeddings/tools/embedding_vocab.cpp | 29 +---------------------- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/src/embeddings/tools/embedding_coocur.cpp b/src/embeddings/tools/embedding_coocur.cpp index b0090576c..51faa8c51 100644 --- a/src/embeddings/tools/embedding_coocur.cpp +++ b/src/embeddings/tools/embedding_coocur.cpp @@ -151,26 +151,6 @@ class coocur_buffer std::size_t chunk_num_ = 0; }; -std::unique_ptr -make_stream(const cpptoml::table& config) -{ - std::unique_ptr stream; - auto analyzers = config.get_table_array("analyzers"); - for (const auto& group : analyzers->get()) - { - auto method = group->get_as("method"); - if (!method) - continue; - - if (*method == analyzers::ngram_word_analyzer::id) - { - stream = analyzers::load_filters(config, *group); - break; - } - } - return stream; -} - hashing::probe_map load_vocab(const std::string& filename) { @@ -242,7 +222,7 @@ int main(int argc, char** argv) return 1; } - auto stream = make_stream(*config); + auto stream = analyzers::load_filters(*config, *embed_cfg); if (!stream) { LOG(fatal) << "Failed to find an ngram-word analyzer configuration in " diff --git a/src/embeddings/tools/embedding_vocab.cpp b/src/embeddings/tools/embedding_vocab.cpp index 68be8e01b..8b7e39ef8 100644 --- a/src/embeddings/tools/embedding_vocab.cpp +++ b/src/embeddings/tools/embedding_vocab.cpp @@ -16,26 +16,6 @@ using namespace meta; -std::unique_ptr -make_stream(const cpptoml::table& config) -{ - std::unique_ptr stream; - auto analyzers = config.get_table_array("analyzers"); - for (const auto& group : analyzers->get()) - { - auto method = group->get_as("method"); - if (!method) - continue; - - if (*method == analyzers::ngram_word_analyzer::id) - { - stream = analyzers::load_filters(config, *group); - break; - } - } - return stream; -} - int main(int argc, char** argv) { if (argc < 2) @@ -58,14 +38,7 @@ int main(int argc, char** argv) auto max_size = vocab_cfg->get_as("max-size") .value_or(std::numeric_limits::max()); - auto stream = make_stream(*config); - if (!stream) - { - LOG(fatal) << "Failed to find an ngram-word analyzer configuration in " - << argv[1] << ENDLG; - return 1; - } - + auto stream = 
analyzers::load_filters(*config, *embed_cfg);
 hashing::probe_map vocab;
 {

From 1e82bbd40bc56ac1715ba4f3ea899a7da7b6672d Mon Sep 17 00:00:00 2001
From: Chase Geigle
Date: Thu, 11 Feb 2016 11:24:38 -0600
Subject: [PATCH 30/48] Cap hours display in printing::progress.

---
 src/util/progress.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/util/progress.cpp b/src/util/progress.cpp
index 64935cdfc..539120eef 100644
--- a/src/util/progress.cpp
+++ b/src/util/progress.cpp
@@ -63,7 +63,8 @@ void progress::print()
         *it++ = ' ';
     it += ::sprintf(&(*it), "%d%%", static_cast<int>(percent * 100));
-    it += ::sprintf(&(*it), " ETA %02d:%02d:%02d", hrs, mins % 60, secs % 60);
+    it += ::sprintf(&(*it), " ETA %02d:%02d:%02d", std::min(999, hrs),
+                    mins % 60, secs % 60);
     LOG(progress) << '\r' << output_ << ENDLG;
 }

From d396592e496eef4996d1ae0f829fa6f1cc77bee3 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Thu, 11 Feb 2016 17:20:00 -0600
Subject: [PATCH 31/48] allow full text storage in index as metadata field

If store-full-text = true (default false) in the corpus config, the string
metadata field "content" will be added. This is to simplify the creation of
full text metadata: the user doesn't have to duplicate their dataset in
metadata.dat, and metadata.dat will still be somewhat human-readable without
large strings of full text added.

This option has no effect for libsvm-corpus.
---
 include/meta/corpus/corpus.h | 22 +++++++++++++++++++++-
 src/corpus/corpus.cpp | 24 ++++++++++++++++++++----
 src/corpus/corpus_factory.cpp | 5 +++++
 src/corpus/file_corpus.cpp | 3 +++
 src/corpus/gz_corpus.cpp | 6 +++++-
 src/corpus/line_corpus.cpp | 6 +++++-
 6 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/include/meta/corpus/corpus.h b/include/meta/corpus/corpus.h
index 56ff6e952..d609c24fd 100644
--- a/include/meta/corpus/corpus.h
+++ b/include/meta/corpus/corpus.h
@@ -37,7 +37,17 @@ namespace corpus
 * The corpus spec toml file also requires a corpus type and an optional
 * encoding for the corpus text.
 *
- * Optional config parameters: none.
+ * Required config parameters: + * ~~~toml + * type = "line-corpus" # for example + * ~~~ + * + * Optional config parameters: + * ~~~toml + * encoding = "utf-8" # default value + * store-full-text = false # default value; N/A for libsvm-corpus + * metadata = # metadata schema; see metadata object + * ~~~ * * @see https://meta-toolkit.org/overview-tutorial.html */ @@ -80,6 +90,12 @@ class corpus */ const std::string& encoding() const; + /** + * @return whether this corpus will create a metadata field for full text + * (called "content") + */ + bool store_full_text() const; + protected: /** * Helper function to be used by deriving classes in implementing @@ -92,10 +108,14 @@ class corpus void set_metadata_parser(metadata_parser&& mdparser); + void set_store_full_text(bool store_full_text); + /// The type of encoding this document uses std::string encoding_; /// The metadata parser util::optional mdata_parser_; + /// Whether to store the original document text + bool store_full_text_; }; /** diff --git a/src/corpus/corpus.cpp b/src/corpus/corpus.cpp index 8fc67171e..0073a7f7e 100644 --- a/src/corpus/corpus.cpp +++ b/src/corpus/corpus.cpp @@ -3,9 +3,9 @@ * @author Sean Massung */ -#include "meta/corpus/corpus.h" -#include "meta/corpus/all.h" #include "cpptoml.h" +#include "meta/corpus/all.h" +#include "meta/corpus/corpus.h" #include "meta/io/filesystem.h" #include "meta/util/shim.h" @@ -14,7 +14,8 @@ namespace meta namespace corpus { -corpus::corpus(std::string encoding) : encoding_{std::move(encoding)} +corpus::corpus(std::string encoding) + : encoding_{std::move(encoding)}, store_full_text_{false} { // nothing } @@ -26,7 +27,12 @@ std::vector corpus::next_metadata() metadata::schema corpus::schema() const { - return mdata_parser_->schema(); + auto schema = mdata_parser_->schema(); + if (store_full_text()) + schema.insert( + schema.begin(), + metadata::field_info{"content", metadata::field_type::STRING}); + return schema; } const std::string& corpus::encoding() const @@ -38,5 +44,15 @@ void corpus::set_metadata_parser(metadata_parser&& parser) { mdata_parser_ = std::move(parser); } + +void corpus::set_store_full_text(bool store_full_text) +{ + store_full_text_ = store_full_text; +} + +bool corpus::store_full_text() const +{ + return store_full_text_; +} } } diff --git a/src/corpus/corpus_factory.cpp b/src/corpus/corpus_factory.cpp index 3de3a984a..443eb8cb9 100644 --- a/src/corpus/corpus_factory.cpp +++ b/src/corpus/corpus_factory.cpp @@ -55,6 +55,11 @@ std::unique_ptr make_corpus(const cpptoml::table& config) result->set_metadata_parser({*prefix + "/" + *dataset + "/metadata.dat", metadata_schema(*corpus_config)}); + + auto store_full_text + = corpus_config->get_as("store-full-text").value_or(false); + result->set_store_full_text(store_full_text); + return result; } } diff --git a/src/corpus/file_corpus.cpp b/src/corpus/file_corpus.cpp index 1abe343a5..87c4c1750 100644 --- a/src/corpus/file_corpus.cpp +++ b/src/corpus/file_corpus.cpp @@ -62,6 +62,9 @@ document file_corpus::next() doc.content(filesystem::file_text(prefix_ + docs_[cur_].first), encoding()); auto mdata = next_metadata(); + if (store_full_text()) + mdata.insert(mdata.begin(), metadata::field{doc.content()}); + // add "path" metadata manually mdata.insert(mdata.begin(), metadata::field{prefix_ + docs_[cur_].first}); doc.mdata(std::move(mdata)); diff --git a/src/corpus/gz_corpus.cpp b/src/corpus/gz_corpus.cpp index cb9285899..02959f2f9 100644 --- a/src/corpus/gz_corpus.cpp +++ b/src/corpus/gz_corpus.cpp @@ -42,7 +42,11 @@ 
document gz_corpus::next() document doc{cur_id_++, label}; doc.content(line, encoding()); - doc.mdata(next_metadata()); + + auto mdata = next_metadata(); + if (store_full_text()) + mdata.insert(mdata.begin(), metadata::field{doc.content()}); + doc.mdata(std::move(mdata)); return doc; } diff --git a/src/corpus/line_corpus.cpp b/src/corpus/line_corpus.cpp index 5a356a53c..57df4803d 100644 --- a/src/corpus/line_corpus.cpp +++ b/src/corpus/line_corpus.cpp @@ -54,8 +54,12 @@ document line_corpus::next() if (!std::getline(infile_, content)) throw corpus_exception{"error parsing line_corpus line " + std::to_string(cur_id_)}; + doc.content(content, encoding()); - doc.mdata(next_metadata()); + auto mdata = next_metadata(); + if (store_full_text()) + mdata.insert(mdata.begin(), metadata::field{doc.content()}); + doc.mdata(std::move(mdata)); return doc; } From 5ca23174385df53eedf14fdf4fdd3d7859aba515 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 11 Feb 2016 17:27:03 -0600 Subject: [PATCH 32/48] display full text from metadata (if it exists) in search demos --- src/index/tools/interactive_search.cpp | 35 ++++++++++---------------- src/index/tools/query_runner.cpp | 24 ++++++++++++++---- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/index/tools/interactive_search.cpp b/src/index/tools/interactive_search.cpp index d35712f94..4b2ec2fa5 100644 --- a/src/index/tools/interactive_search.cpp +++ b/src/index/tools/interactive_search.cpp @@ -18,19 +18,6 @@ using namespace meta; -/** - * @param path The path to the file to open - * @return the text content of that file - */ -std::string get_content(const std::string& path) -{ - std::ifstream in{path}; - std::string str{(std::istreambuf_iterator(in)), - std::istreambuf_iterator()}; - std::replace(str.begin(), str.end(), '\n', ' '); - return str; -} - /** * Demo app to allow a user to create queries and search an index. */ @@ -63,8 +50,7 @@ int main(int argc, char* argv[]) std::string prefix = *config->get_as("prefix") + "/" + *config->get_as("dataset") + "/"; - std::cout << "Enter a query, or blank to quit." << std::endl - << std::endl; + std::cout << "Enter a query, or blank to quit." << std::endl << std::endl; std::string text; while (true) @@ -92,15 +78,20 @@ int main(int argc, char* argv[]) for (auto& result : ranking) { std::string path{idx->doc_path(result.d_id)}; - std::cout << printing::make_bold( - std::to_string(result_num) + ". " + path + " (" - + std::to_string(result.score) + ")") << std::endl; - std::cout << get_content(prefix + path) << std::endl - << std::endl; + auto output + = printing::make_bold(std::to_string(result_num) + ". " + path) + + " (score = " + std::to_string(result.score) + ", docid = " + + std::to_string(result.d_id) + ")"; + std::cout << output << std::endl; + auto mdata = idx->metadata(result.d_id); + if (auto content = mdata.get("content")) + { + auto len = std::min(77ul, content->size()); + std::cout << content->substr(0, len) << "..." 
<< std::endl + << std::endl; + } if (result_num++ == 5) break; } - - std::cout << std::endl; } } diff --git a/src/index/tools/query_runner.cpp b/src/index/tools/query_runner.cpp index 571d3c991..28ae63c20 100644 --- a/src/index/tools/query_runner.cpp +++ b/src/index/tools/query_runner.cpp @@ -74,17 +74,31 @@ int main(int argc, char* argv[]) { corpus::document query{doc_id{0}}; query.content(content); - std::cout << "Query " << ++i << ": " << std::endl; - std::cout << std::string(20, '=') << std::endl; + std::cout << std::string(80, '=') << std::endl; + std::cout << "Query " << ++i << ": \"" << content << "\"" + << std::endl; + std::cout << std::string(80, '-') << std::endl; // Use the ranker to score the query over the index. auto ranking = ranker->score(*idx, query); auto result_num = 1; for (auto& result : ranking) { - std::cout << result_num << ". " - << idx->doc_name(result.d_id) << " " - << result.score << std::endl; + std::string path{idx->doc_path(result.d_id)}; + auto output = printing::make_bold(std::to_string(result_num) + + ". " + path) + + " (score = " + std::to_string(result.score) + + ", docid = " + std::to_string(result.d_id) + + ")"; + std::cout << output << std::endl; + auto mdata = idx->metadata(result.d_id); + if (auto content = mdata.get("content")) + { + auto len = std::min(77ul, content->size()); + std::cout << content->substr(0, len) << "..." + << std::endl + << std::endl; + } if (result_num++ == 10) break; } From bf65aef0d248141101b80dc614fa851cd12039d7 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 17:49:34 -0600 Subject: [PATCH 33/48] Update HandleOutOfTreeLLVM.patch for travis for latest libc++. --- travis/HandleOutOfTreeLLVM.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/HandleOutOfTreeLLVM.patch b/travis/HandleOutOfTreeLLVM.patch index c98c600a4..5d597bfb4 100644 --- a/travis/HandleOutOfTreeLLVM.patch +++ b/travis/HandleOutOfTreeLLVM.patch @@ -4,8 +4,8 @@ set(LLVM_INCLUDE_DIR ${INCLUDE_DIR} CACHE PATH "Path to llvm/include") set(LLVM_BINARY_DIR ${LLVM_OBJ_ROOT} CACHE PATH "Path to LLVM build tree") set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree") -- set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/share/llvm/cmake") -+ set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/share/llvm/cmake" CACHE PATH "Path to LLVM cmake modules") +- set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") ++ set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm" CACHE PATH "Path to LLVM cmake modules") else() set(LLVM_FOUND OFF) return() From 29d026e0add4f1ea7df0e9a602270c8fe1040a97 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 17:53:19 -0600 Subject: [PATCH 34/48] Fix errors in util::string_view. 
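Two conformance fixes: std::reverse_iterator's converting constructor is
explicit, so returning {end()} via copy-list-initialization is ill-formed and
the iterator has to be constructed directly; and find_last_of mixed the
unsigned size_type with the signed difference_type produced by std::distance
and reverse-iterator arithmetic, so those conversions are now spelled out. A
minimal sketch of the iterator issue (using std::string for brevity):

```cpp
#include <iterator>
#include <string>

using riter = std::reverse_iterator<std::string::const_iterator>;

riter make_rbegin(const std::string& s)
{
    // return {s.end()};   // ill-formed: copy-list-initialization cannot
    //                     // call the explicit converting constructor
    return riter{s.end()}; // OK: direct-initialization
}
```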
--- include/meta/util/string_view.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/meta/util/string_view.h b/include/meta/util/string_view.h index 97ea9b493..5f352a784 100644 --- a/include/meta/util/string_view.h +++ b/include/meta/util/string_view.h @@ -115,12 +115,12 @@ class basic_string_view const_reverse_iterator rbegin() const noexcept { - return {end()}; + return const_reverse_iterator{end()}; } const_reverse_iterator rend() const noexcept { - return {begin()}; + return const_reverse_iterator{begin()}; } const_reverse_iterator crbegin() const noexcept @@ -375,12 +375,13 @@ class basic_string_view if (pos >= size()) return npos; - auto diff = size() - std::min(size(), pos); + auto diff + = static_cast(size() - std::min(size(), pos)); auto it = std::find_first_of(rbegin() + diff, rend(), s.begin(), s.end(), Traits::eq); if (it == rend()) return npos; - return size() - 1 - std::distance(rbegin(), it); + return size() - 1 - static_cast(std::distance(rbegin(), it)); } constexpr size_type find_last_of(Char c, size_type pos = npos) const From 51fca42a7f8565f84adf1fc7787a21c6db0e9dfe Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 18:24:10 -0600 Subject: [PATCH 35/48] Minor cleanup in printing::progress. --- src/util/progress.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/util/progress.cpp b/src/util/progress.cpp index 539120eef..c6aeeb0d5 100644 --- a/src/util/progress.cpp +++ b/src/util/progress.cpp @@ -37,9 +37,7 @@ progress::progress(const std::string& prefix, uint64_t length, int interval) void progress::print() { using namespace std::chrono; - uint64_t iter = iter_; - if (iter == 0) - iter = 1; + auto iter = std::max(uint64_t{1}, iter_.load()); auto tp = steady_clock::now(); auto percent = static_cast(iter) / length_; auto elapsed = duration_cast(tp - start_).count(); From 62d01fad8e0311aef662b74e2ae805f03592780f Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 18:24:27 -0600 Subject: [PATCH 36/48] Fix bug in progress reporting in multiway_merge. The first chunk in the equal_range was not having its bytes read tracked properly. --- include/meta/util/multiway_merge.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/meta/util/multiway_merge.h b/include/meta/util/multiway_merge.h index f95b5608b..20905879c 100644 --- a/include/meta/util/multiway_merge.h +++ b/include/meta/util/multiway_merge.h @@ -115,7 +115,9 @@ uint64_t multiway_merge(ForwardIterator begin, ForwardIterator end, to_merge.front(), chunk_iter_comp); auto merged = std::move(*(*range.first).get()); + auto before = (*range.first).get().bytes_read(); ++(*range.first).get(); + total_read += ((*range.first).get().bytes_read() - before); ++range.first; std::for_each(range.first, range.second, [&](ChunkIterator& iter) { From adcfe7207dcf1e33bf13ed7ae7d8e59684685f3d Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Thu, 11 Feb 2016 18:31:06 -0600 Subject: [PATCH 37/48] Attempt to use two jobs in make on Travis. 
--- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3e140dd36..f9aaf4c00 100644 --- a/.travis.yml +++ b/.travis.yml @@ -96,5 +96,5 @@ before_script: script: - git submodule update --init --recursive - - ../travis/cmake.sh Debug && make && make clean - - rm -rf CMake* && ../travis/cmake.sh Release && make && ctest --output-on-failure + - ../travis/cmake.sh Debug && make -j2 && make clean + - rm -rf CMake* && ../travis/cmake.sh Release && make -j2 && ctest --output-on-failure From 60c8d48f0bfba85e26cd78d3766ab082fbf203b0 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Thu, 11 Feb 2016 18:54:25 -0600 Subject: [PATCH 38/48] use same types in call to std::min --- src/index/tools/interactive_search.cpp | 3 ++- src/index/tools/query_runner.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/index/tools/interactive_search.cpp b/src/index/tools/interactive_search.cpp index 4b2ec2fa5..3ba580bdc 100644 --- a/src/index/tools/interactive_search.cpp +++ b/src/index/tools/interactive_search.cpp @@ -86,7 +86,8 @@ int main(int argc, char* argv[]) auto mdata = idx->metadata(result.d_id); if (auto content = mdata.get("content")) { - auto len = std::min(77ul, content->size()); + auto len + = std::min(std::string::size_type{77}, content->size()); std::cout << content->substr(0, len) << "..." << std::endl << std::endl; } diff --git a/src/index/tools/query_runner.cpp b/src/index/tools/query_runner.cpp index 28ae63c20..252097fd0 100644 --- a/src/index/tools/query_runner.cpp +++ b/src/index/tools/query_runner.cpp @@ -94,7 +94,8 @@ int main(int argc, char* argv[]) auto mdata = idx->metadata(result.d_id); if (auto content = mdata.get("content")) { - auto len = std::min(77ul, content->size()); + auto len = std::min(std::string::size_type{77}, + content->size()); std::cout << content->substr(0, len) << "..." << std::endl << std::endl; From 8fa14767dc0c61db595c67e77c91e3ea59f4b051 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 12 Feb 2016 11:08:54 -0600 Subject: [PATCH 39/48] fix bug in gmap calculation where 0 average precision was ignored --- src/index/eval/ir_eval.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/index/eval/ir_eval.cpp b/src/index/eval/ir_eval.cpp index 6a912846c..b4642b501 100644 --- a/src/index/eval/ir_eval.cpp +++ b/src/index/eval/ir_eval.cpp @@ -46,10 +46,12 @@ void ir_eval::init_index(const std::string& path) while (in.good()) { std::getline(in, line); - bool trec = (std::count_if(line.begin(), line.end(), [](char ch) + bool trec = (std::count_if(line.begin(), line.end(), + [](char ch) { return ch == ' '; - }) == 3); // 3 spaces == 4 columns + }) + == 3); // 3 spaces == 4 columns std::istringstream iss{line}; iss >> q_id; if (trec) @@ -205,8 +207,11 @@ double ir_eval::gmap() const double sum = 0.0; for (auto& s : scores_) - if (s > 0.0) - sum += std::log(s); + { + if (s <= 0.0) + return 0.0; + sum += std::log(s); + } return std::exp(sum / scores_.size()); } From b9182c11f1017521e3c63019800a5057fad0e162 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 12 Feb 2016 11:14:37 -0600 Subject: [PATCH 40/48] Print newline before announcing chunk flush in embedding-coocur. 
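printing::progress redraws a single terminal line by emitting '\r' and
rewriting it in place, so an info message logged while a progress bar is
active would otherwise land on the half-drawn bar's line and be garbled by
the next redraw. Starting the message with a newline moves it onto a fresh
line. A tiny illustration of the interaction (labels and sizes illustrative):

```cpp
#include <cstdio>

int main()
{
    // a '\r'-style progress line, as printing::progress draws it
    std::printf("\r > Counting cooccurrences: 42%%");
    // the leading '\n' keeps this message off of the progress bar's line
    std::printf("\nFlushing buffer of size: 1MiB with 1000 unique pairs\n");
    return 0;
}
```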
--- src/embeddings/tools/embedding_coocur.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/tools/embedding_coocur.cpp b/src/embeddings/tools/embedding_coocur.cpp index 51faa8c51..06d17c3ee 100644 --- a/src/embeddings/tools/embedding_coocur.cpp +++ b/src/embeddings/tools/embedding_coocur.cpp @@ -53,7 +53,7 @@ class coocur_buffer void flush() { - LOG(info) << "Flushing buffer of size: " + LOG(info) << "\nFlushing buffer of size: " << printing::bytes_to_units(coocur_.bytes_used()) << " with " << coocur_.size() << " unique pairs" << ENDLG; From 3b64dde9061ca2530bd627673b10a03afd3dfe89 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 12 Feb 2016 11:36:27 -0600 Subject: [PATCH 41/48] add unit tests for gmap bug found in 8fa14767dc0c61db595c67e77c91e3ea59f4b051 --- tests/ir_eval_test.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/ir_eval_test.cpp b/tests/ir_eval_test.cpp index 22aedbc7a..8844fc43d 100644 --- a/tests/ir_eval_test.cpp +++ b/tests/ir_eval_test.cpp @@ -76,9 +76,12 @@ go_bandit([]() { Is().GreaterThanOrEqualTo(0).And().LessThanOrEqualTo(1)); AssertThat(eval.gmap(), Is().GreaterThanOrEqualTo(0).And().LessThanOrEqualTo(1)); + + // geometric mean of numbers is always <= arithmetic mean + AssertThat(eval.map(), Is().GreaterThanOrEqualTo(eval.gmap())); }); - it("should compute correct results", []() { + it("should compute correct eval measures", []() { auto file_cfg = tests::create_config("file"); index::ir_eval eval{*file_cfg}; @@ -144,6 +147,15 @@ go_bandit([]() { check_query(eval, results, qid, 1.0, 1.0, 1.0, 1.0, 1.0); // recall is still not perfect @5 check_query(eval, results, qid, 1.0 / 1.5, 1.0, 0.5, 1.0, 1.0, 5); + + // add result with zero AP + results.clear(); + results.emplace_back(doc_id{2}, 0.9); // not relevant + avg_p = eval.avg_p(results, qid, 1000); + AssertThat(avg_p, EqualsWithDelta(0.0, delta)); + AssertThat(eval.map(), + Is().GreaterThanOrEqualTo(0).And().LessThanOrEqualTo(1)); + AssertThat(eval.gmap(), EqualsWithDelta(0.0, delta)); }); }); From c6f92c577876ec8031a61526690199aa914d1602 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 12 Feb 2016 11:40:38 -0600 Subject: [PATCH 42/48] Add tests for the rest of the vector math library. 
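The new cases pin down the scalar operations and norms added to
meta/math/vector.h; with a = {2, 2, 2, 2} the expected values follow
directly. A usage sketch of what the assertions check:

```cpp
#include <vector>

#include "meta/math/vector.h"

int main()
{
    using namespace meta::math::operators;
    std::vector<int> a = {2, 2, 2, 2};

    auto halved  = a / 2; // {1, 1, 1, 1}
    auto doubled = 2 * a; // {4, 4, 4, 4}

    auto l2 = l2norm(a);  // std::sqrt(2 * 2 + 2 * 2 + 2 * 2 + 2 * 2) == 4.0
    auto l1 = l1norm(a);  // |2| + |2| + |2| + |2| == 8.0

    (void)halved;
    (void)doubled;
    (void)l2;
    (void)l1;
    return 0;
}
```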
--- tests/vector_math_test.cpp | 112 ++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/tests/vector_math_test.cpp b/tests/vector_math_test.cpp index 2461c9caf..450a184c2 100644 --- a/tests/vector_math_test.cpp +++ b/tests/vector_math_test.cpp @@ -79,13 +79,14 @@ go_bandit([]() { AssertThat(v, Equals(3)); }); - it("should add two vectors (array_view + array_view)", [&]() { - auto c = util::array_view(a) - + util::array_view(b); - AssertThat(c.size(), Equals(a.size())); - for (const auto& v : c) - AssertThat(v, Equals(3)); - }); + it("should add two vectors (array_view + array_view)", + [&]() { + auto c + = util::array_view(a) + util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(3)); + }); it("should subtract two vectors (const ref - const ref)", [&]() { auto c = a - b; @@ -143,20 +144,105 @@ go_bandit([]() { AssertThat(v, Equals(1)); }); - it("should subtract two vectors (array_view - array_view)", [&]() { - auto c = util::array_view(a) - - util::array_view(b); + it("should subtract two vectors (array_view - array_view)", + [&]() { + auto c = util::array_view(a) + - util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should add two vectors (array_view - array_view)", + [&]() { + auto c + = util::array_view(a) - util::array_view(b); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should divide by a constant (const ref / scalar)", [&]() { + auto c = a / 2; AssertThat(c.size(), Equals(a.size())); for (const auto& v : c) AssertThat(v, Equals(1)); }); - it("should add two vectors (array_view - array_view)", [&]() { - auto c = util::array_view(a) - - util::array_view(b); + it("should divide by a constant (rval / scalar)", [&]() { + auto c = std::vector(a) / 2; AssertThat(c.size(), Equals(a.size())); for (const auto& v : c) AssertThat(v, Equals(1)); }); + + it("should divide by a constant (array_view / scalar)", [&]() { + auto c = util::array_view(a) / 2; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(1)); + }); + + it("should multiply by a constant (const ref * scalar)", [&]() { + auto c = a * 2; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should multiply by a constant (rval * scalar)", [&]() { + auto c = std::vector(a) * 2; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should multiply by a constant (array_view * scalar)", [&]() { + auto c = util::array_view(a) * 2; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should multiply by a constant (scalar * const ref)", [&]() { + auto c = 2 * a; + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should multiply by a constant (scalar * rval)", [&]() { + auto c = 2 * std::vector(a); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should multiply by a constant (scalar * array_view)", [&]() { + auto c = 2 * std::vector(a); + AssertThat(c.size(), Equals(a.size())); + for (const auto& v : c) + AssertThat(v, Equals(4)); + }); + + it("should compute the l2-norm (const ref)", [&]() { + auto val = l2norm(a); + AssertThat(val, Equals(std::sqrt(2 * 2 + 2 * 2 + 2 * 
2 + 2 * 2))); + }); + + it("should compute the l2-norm (array_view)", [&]() { + auto val = l2norm(util::array_view(a)); + AssertThat(val, Equals(std::sqrt(2 * 2 + 2 * 2 + 2 * 2 + 2 * 2))); + }); + + it("should compute the l1-norm (const ref)", [&]() { + auto val = l1norm(a); + AssertThat(val, Equals(2 + 2 + 2 + 2)); + }); + + it("should compute the l1-norm (const ref)", [&]() { + auto val = l1norm(util::array_view(a)); + AssertThat(val, Equals(2 + 2 + 2 + 2)); + }); }); }); From a1c258b90e35297da6c91712568d64d962460a09 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 12 Feb 2016 14:32:47 -0600 Subject: [PATCH 43/48] Switch to using ./unit-test without ctest on Travis/Appveyor. --- .appveyor.yml | 2 +- .travis.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index c286bef9e..b280f1d86 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -17,4 +17,4 @@ before_build: build_script: - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && make" test_script: - - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && cp ../config.toml . && ctest --output-on-failure" + - bash -lc "export PATH=/mingw64/bin:$PATH && cd $APPVEYOR_BUILD_FOLDER/build && cp ../config.toml . && ./unit-test --reporter=spec" diff --git a/.travis.yml b/.travis.yml index f9aaf4c00..071c06c40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -97,4 +97,4 @@ before_script: script: - git submodule update --init --recursive - ../travis/cmake.sh Debug && make -j2 && make clean - - rm -rf CMake* && ../travis/cmake.sh Release && make -j2 && ctest --output-on-failure + - rm -rf CMake* && ../travis/cmake.sh Release && make -j2 && ./unit-test --reporter=spec From 23f28fe1acf04bcd584f7acb897f60facc3247a2 Mon Sep 17 00:00:00 2001 From: Chase Geigle Date: Fri, 12 Feb 2016 14:38:12 -0600 Subject: [PATCH 44/48] Switch to just using the standalone ./unit-test instead of ctest. There aren't really many advantages for us to using CTest at this point with the new unit test framework, so just use our unit test executable. --- CMakeLists.txt | 1 - README.md | 14 +++---- tests/CMakeLists.txt | 3 -- tests/unit_tests.cmake | 87 ------------------------------------------ 4 files changed, 7 insertions(+), 98 deletions(-) delete mode 100644 tests/unit_tests.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 040f92e37..3ad7cd1a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,6 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -include(CTest) include(CheckCXXCompilerFlag) include(CheckCXXSourceCompiles) include(CheckCXXSourceRuns) diff --git a/README.md b/README.md index 782e5a9f6..91fb8827c 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ make You can now test the system by running the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! MeTA seems to be working on your @@ -193,7 +193,7 @@ make You can now test the system by running the following command: ```bash -/usr/local/bin/ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! MeTA seems to be working on your @@ -269,7 +269,7 @@ make You can now test the system by running the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! 
MeTA seems to be working on your @@ -310,7 +310,7 @@ make You can now test the system by running the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! MeTA seems to be working on your @@ -381,7 +381,7 @@ make You can now test the system with the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` ## EWS/EngrIT Build Guide @@ -449,7 +449,7 @@ make You can now test the system by running the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! MeTA seems to be working on your @@ -497,7 +497,7 @@ make You can now test the system by running the following command: ```bash -ctest --output-on-failure +./unit-test --reporter=spec ``` If everything passes, congratulations! MeTA seems to be working on your diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index de4e9a0a7..bb17ae871 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,6 +40,3 @@ target_link_libraries(unit-test meta-index meta-features meta-language-model meta-topics) - -set(UNIT_TEST_EXE unit-test) -include(unit_tests.cmake) diff --git a/tests/unit_tests.cmake b/tests/unit_tests.cmake deleted file mode 100644 index 366199354..000000000 --- a/tests/unit_tests.cmake +++ /dev/null @@ -1,87 +0,0 @@ -add_test(analyzers ${UNIT_TEST_EXE} --only=[analyzers] --reporter=spec) -set_tests_properties(analyzers PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(stemmers ${UNIT_TEST_EXE} --only=[stemmers] --reporter=spec) -set_tests_properties(stemmers PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(parallel ${UNIT_TEST_EXE} --only=[parallel] --reporter=spec) -set_tests_properties(parallel PROPERTIES TIMEOUT 30 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(inverted-index ${UNIT_TEST_EXE} --only=[inverted-index] --reporter=spec) -set_tests_properties(inverted-index PROPERTIES TIMEOUT 30 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(forward-index ${UNIT_TEST_EXE} --only=[forward-index] --reporter=spec) -set_tests_properties(forward-index PROPERTIES TIMEOUT 30 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(classifier ${UNIT_TEST_EXE} --only=[classifier] --reporter=spec) -set_tests_properties(classifier PROPERTIES TIMEOUT 100 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(regression ${UNIT_TEST_EXE} --only=[regression] --reporter=spec) -set_tests_properties(regression PROPERTIES TIMEOUT 100 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(string-list ${UNIT_TEST_EXE} --only=[string-list] --reporter=spec) -set_tests_properties(string-list PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(vocabulary-map ${UNIT_TEST_EXE} --only=[vocabulary-map] --reporter=spec) -set_tests_properties(vocabulary-map PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(libsvm-parser ${UNIT_TEST_EXE} --only=[libsvm-parser] --reporter=spec) -set_tests_properties(libsvm-parser PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(rankers ${UNIT_TEST_EXE} --only=[rankers] --reporter=spec) -set_tests_properties(rankers PROPERTIES TIMEOUT 90 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(ir-eval ${UNIT_TEST_EXE} --only=[ir-eval] --reporter=spec) -set_tests_properties(ir-eval 
PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(binary-io ${UNIT_TEST_EXE} --only=[binary-io] --reporter=spec) -set_tests_properties(binary-io PROPERTIES TIMEOUT 30 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(graph ${UNIT_TEST_EXE} --only=[graph] --reporter=spec) -set_tests_properties(graph PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(parser ${UNIT_TEST_EXE} --only=[parser] --reporter=spec) -set_tests_properties(parser PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(language-model ${UNIT_TEST_EXE} --only=[language-model] --reporter=spec) -set_tests_properties(language-model PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(filesystem ${UNIT_TEST_EXE} --only=[filesystem] --reporter=spec) -set_tests_properties(filesystem PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(feature-selection ${UNIT_TEST_EXE} --only=[feature-selection] --reporter=spec) -set_tests_properties(feature-selection PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(tokenizer-filter ${UNIT_TEST_EXE} --only=[tokenizer-filter] --reporter=spec) -set_tests_properties(tokenizer-filter PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(metadata ${UNIT_TEST_EXE} --only=[metadata] --reporter=spec) -set_tests_properties(metadata PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(topics ${UNIT_TEST_EXE} --only=[topics] --reporter=spec) -set_tests_properties(topics PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - -add_test(hashing ${UNIT_TEST_EXE} --only=[hashing] --reporter=spec) -set_tests_properties(hashing PROPERTIES TIMEOUT 10 WORKING_DIRECTORY - ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) From 1516c5c7f4b177aed0291c69120d160946cc5929 Mon Sep 17 00:00:00 2001 From: Sean Massung Date: Fri, 12 Feb 2016 14:43:11 -0600 Subject: [PATCH 45/48] allow make_index to take a user-supplied corpus object The internal {forward,inverted}_index::create_index functions now all take two parameters: the global config and a corpus. The make_index function passes the "default" corpus created from the global config if no corpus is specified. --- include/meta/index/forward_index.h | 31 ++++++++------ include/meta/index/inverted_index.h | 28 +++++++----- include/meta/index/make_index.h | 19 +++++++-- src/index/forward_index.cpp | 66 ++++++++++++++--------------- src/index/inverted_index.cpp | 23 +++++----- 5 files changed, 93 insertions(+), 74 deletions(-) diff --git a/include/meta/index/forward_index.h b/include/meta/index/forward_index.h index 84c3ca313..fae047aed 100644 --- a/include/meta/index/forward_index.h +++ b/include/meta/index/forward_index.h @@ -55,20 +55,26 @@ class forward_index : public disk_index { public: /** - * forward_index is a friend of the factory method used to create - * it. + * forward_index is a friend of the factory method used to create it. */ template friend std::shared_ptr make_index(const cpptoml::table& config, Args&&... args); /** - * forward_index is a friend of the factory method used to create - * cached versions of it. + * forward_index is a friend of the factory method used to create it. + */ + template + friend std::shared_ptr make_index(const cpptoml::table& config, + corpus::corpus& docs, + Args&&... 
+    /**
+     * forward_index is a friend of the factory method used to create cached
+     * versions of it.
      */
     template <class Index, template <class, class> class Cache, class... Args>
     friend std::shared_ptr<cached_index<Index, Cache>>
-        make_index(const cpptoml::table& config_file, Args&&... args);
+    make_index(const cpptoml::table& config_file, Args&&... args);
 
     using primary_key_type = doc_id;
     using secondary_key_type = term_id;
@@ -116,14 +122,14 @@ class forward_index : public disk_index
      * @return the postings data for a given doc_id
      */
     virtual std::shared_ptr<postings_data_type>
-        search_primary(doc_id d_id) const;
+    search_primary(doc_id d_id) const;
 
     /**
      * @param d_id The doc_id to search for
      * @return the postings stream for a given doc_id
      */
     util::optional<postings_stream<term_id>>
-        stream_for(doc_id d_id) const;
+    stream_for(doc_id d_id) const;
 
     /**
      * @param d_id The document id of the doc to convert to liblinear format
@@ -138,16 +144,17 @@ class forward_index : public disk_index
 
   private:
     /**
-     * This function loads a disk index from its filesystem
-     * representation.
+     * Loads a forward index from its filesystem representation.
      */
     void load_index();
 
     /**
-     * This function initializes the forward index.
-     * @param config_file The configuration file used to create the index
+     * Initializes the forward index; it is called by the make_index factory
+     * function.
+     * @param config The configuration to be used
+     * @param docs A corpus object of documents to index
      */
-    void create_index(const cpptoml::table& config);
+    void create_index(const cpptoml::table& config, corpus::corpus& docs);
 
     /**
      * @return whether this index contains all necessary files
diff --git a/include/meta/index/inverted_index.h b/include/meta/index/inverted_index.h
index 7eaa9af4f..0689b21d1 100644
--- a/include/meta/index/inverted_index.h
+++ b/include/meta/index/inverted_index.h
@@ -71,15 +71,21 @@ class inverted_index : public disk_index
     using exception = inverted_index_exception;
 
     /**
-     * inverted_index is a friend of the factory method used to create
-     * it.
+     * inverted_index is a friend of the factory method used to create it.
      */
     template <class Index, class... Args>
     friend std::shared_ptr<Index> make_index(const cpptoml::table&, Args&&...);
 
     /**
-     * inverted_index is a friend of the factory method used to create
-     * cached versions of it.
+     * inverted_index is a friend of the factory method used to create it.
+     */
+    template <class Index, class... Args>
+    friend std::shared_ptr<Index> make_index(const cpptoml::table&,
+                                             corpus::corpus& docs, Args&&...);
+
+    /**
+     * inverted_index is a friend of the factory method used to create cached
+     * versions of it.
      */
     template <class Index, template <class, class> class Cache, class... Args>
     friend std::shared_ptr<cached_index<Index, Cache>>
@@ -168,17 +174,17 @@ class inverted_index : public disk_index
 
   private:
     /**
-     * This function initializes the disk index; it is called by the
-     * make_index factory function.
-     * @param config The configuration to be used
+     * Loads an inverted index from its filesystem representation.
      */
-    void create_index(const cpptoml::table& config);
+    void load_index();
 
     /**
-     * This function loads a disk index from its filesystem
-     * representation.
+     * Initializes the inverted index; it is called by the make_index factory
+     * function.
+     * @param config The configuration to be used
+     * @param docs A corpus object of documents to index
      */
-    void load_index();
+    void create_index(const cpptoml::table& config, corpus::corpus& docs);
 
     /**
      * @return whether this index contains all necessary files
diff --git a/include/meta/index/make_index.h b/include/meta/index/make_index.h
index 46204710a..5619d5ee0 100644
--- a/include/meta/index/make_index.h
+++ b/include/meta/index/make_index.h
@@ -13,6 +13,7 @@
 
 #include "cpptoml.h"
 #include "meta/caching/all.h"
+#include "meta/corpus/corpus_factory.h"
 #include "meta/index/cached_index.h"
 #include "meta/io/filesystem.h"
 
@@ -51,12 +52,14 @@ using splay_forward_index = cached_index<forward_index, caching::splay_cache>;
  * ~~~
  *
  * @param config The configuration to be used to build the index
+ * @param docs The collection of documents to index
  * @param args any additional arguments to forward to the
  * constructor for the chosen index type (usually none)
  * @return A properly initialized index
  */
 template <class Index, class... Args>
-std::shared_ptr<Index> make_index(const cpptoml::table& config, Args&&... args)
+std::shared_ptr<Index> make_index(const cpptoml::table& config,
+                                  corpus::corpus& docs, Args&&... args)
 {
     // check if we have paths specified for either kind of index
     if (!(config.contains("forward-index")
@@ -92,11 +95,21 @@ std::shared_ptr<Index> make_index(const cpptoml::table& config, Args&&... args)
     if (!filesystem::make_directory(idx->index_name()) && idx->valid())
         idx->load_index();
     else
-        idx->create_index(config);
+        idx->create_index(config, docs);
 
     return idx;
 }
 
+/**
+ * Helper for make_index that creates a corpus from the global config file.
+ */
+template <class Index, class... Args>
+std::shared_ptr<Index> make_index(const cpptoml::table& config, Args&&... args)
+{
+    auto docs = corpus::make_corpus(config);
+    return make_index<Index>(config, *docs, std::forward<Args>(args)...);
+}
+
 /**
  * Factory method for creating indexes that are cached.
  * Usage:
@@ -118,7 +131,7 @@ std::shared_ptr<Index> make_index(const cpptoml::table& config, Args&&... args)
  */
 template <class Index, template <class, class> class Cache, class... Args>
 std::shared_ptr<cached_index<Index, Cache>>
-    make_index(const cpptoml::table& config, Args&&... args)
+make_index(const cpptoml::table& config, Args&&... args)
 {
     return make_index<cached_index<Index, Cache>>(config,
                                                   std::forward<Args>(args)...);
diff --git a/src/index/forward_index.cpp b/src/index/forward_index.cpp
index c341fe890..80cbddf1f 100644
--- a/src/index/forward_index.cpp
+++ b/src/index/forward_index.cpp
@@ -3,11 +3,11 @@
  * @author Sean Massung
  */
 
+#include "cpptoml.h"
 #include "meta/analyzers/analyzer.h"
 #include "meta/corpus/corpus.h"
 #include "meta/corpus/corpus_factory.h"
 #include "meta/corpus/libsvm_corpus.h"
-#include "cpptoml.h"
 #include "meta/hashing/probe_map.h"
 #include "meta/index/chunk_reader.h"
 #include "meta/index/disk_index_impl.h"
@@ -52,7 +52,7 @@ class forward_index::impl
      * num_threads number of forward_index chunks that then need to be
      * merged.
      */
-    void tokenize_docs(corpus::corpus* corpus,
+    void tokenize_docs(corpus::corpus& corpus,
                        const analyzers::analyzer& analyzer,
                        metadata_writer& mdata_writer, uint64_t ram_budget);
 
@@ -69,9 +69,9 @@ class forward_index::impl
                       hashing::probe_map<std::string, term_id> vocab);
 
     /**
-     * @param config the configuration settings for this index
+     * @param docs The documents to index (that are in libsvm format)
      */
-    void create_libsvm_postings(const cpptoml::table& config);
+    void create_libsvm_postings(corpus::corpus& docs);
 
     /**
      * @param inv_idx The inverted index to uninvert
@@ -191,7 +191,8 @@ void forward_index::load_index()
     unique_terms_file >> fwd_impl_->total_unique_terms_;
 }
 
-void forward_index::create_index(const cpptoml::table& config)
+void forward_index::create_index(const cpptoml::table& config,
+                                 corpus::corpus& docs)
 {
     {
         std::ofstream config_file{index_name() + "/config.toml"};
@@ -205,7 +206,7 @@ void forward_index::create_index(const cpptoml::table& config)
         LOG(info) << "Creating index from libsvm data: " << index_name()
                   << ENDLG;
 
-        fwd_impl_->create_libsvm_postings(config);
+        fwd_impl_->create_libsvm_postings(docs);
 
         impl_->save_label_id_mapping();
     }
     else
@@ -217,9 +218,10 @@
         {
             LOG(info) << "Creating index by uninverting: " << index_name()
                       << ENDLG;
+
            {
                 // Ensure all files are flushed before uninverting
-                make_index<inverted_index>(config);
+                make_index<inverted_index>(config, docs);
             }
             auto inv_idx = make_index<inverted_index>(config);
@@ -234,26 +236,22 @@
     {
         LOG(info) << "Creating forward index: " << index_name() << ENDLG;
 
-        auto docs = corpus::make_corpus(config);
-
-        {
-            auto analyzer = analyzers::load(config);
+        auto analyzer = analyzers::load(config);
 
-            metadata_writer mdata_writer{index_name(), docs->size(),
-                                         docs->schema()};
+        metadata_writer mdata_writer{index_name(), docs.size(),
+                                     docs.schema()};
 
-            impl_->load_labels(docs->size());
+        impl_->load_labels(docs.size());
 
-            // RAM budget is given in MB
-            fwd_impl_->tokenize_docs(docs.get(), *analyzer, mdata_writer,
-                                     ram_budget * 1024 * 1024);
-            impl_->load_term_id_mapping();
-            impl_->save_label_id_mapping();
-            fwd_impl_->total_unique_terms_ = impl_->total_unique_terms();
+        // RAM budget is given in MB
+        fwd_impl_->tokenize_docs(docs, *analyzer, mdata_writer,
+                                 ram_budget * 1024 * 1024);
+        impl_->load_term_id_mapping();
+        impl_->save_label_id_mapping();
+        fwd_impl_->total_unique_terms_ = impl_->total_unique_terms();
 
-        // reload the label file to ensure it was flushed
-        impl_->load_labels();
-    }
+        // reload the label file to ensure it was flushed
+        impl_->load_labels();
     }
 }
 
@@ -271,7 +269,7 @@
     LOG(info) << "Done creating index: " << index_name() << ENDLG;
 }
 
-void forward_index::impl::tokenize_docs(corpus::corpus* docs,
+void forward_index::impl::tokenize_docs(corpus::corpus& docs,
                                         const analyzers::analyzer& ana,
                                         metadata_writer& mdata_writer,
                                         uint64_t ram_budget)
 {
     std::mutex io_mutex;
     std::mutex corpus_mutex;
     std::mutex vocab_mutex;
-    printing::progress progress{" > Tokenizing Docs: ", docs->size()};
+    printing::progress progress{" > Tokenizing Docs: ", docs.size()};
 
     hashing::probe_map<std::string, term_id> vocab;
     bool exceeded_budget = false;
@@ -295,10 +293,10 @@ void forward_index::impl::tokenize_docs(corpus::corpus* docs,
         {
             std::lock_guard<std::mutex> lock{corpus_mutex};
-            if (!docs->has_next())
+            if (!docs.has_next())
                 return;
 
-            doc = docs->next();
+            doc = docs.next();
         }
 
         {
             std::lock_guard<std::mutex> lock{io_mutex};
@@ -369,7 +367,7 @@
 
     progress.end();
 
-    merge_chunks(num_threads, docs->size(), std::move(vocab));
+    merge_chunks(num_threads, docs.size(), std::move(vocab));
 }
 
 void forward_index::impl::merge_chunks(
@@ -437,12 +435,10 @@
     });
 }
 
-void forward_index::impl::create_libsvm_postings(const cpptoml::table& config)
+void forward_index::impl::create_libsvm_postings(corpus::corpus& docs)
 {
     auto filename = idx_->index_name() + idx_->impl_->files[POSTINGS];
-
-    auto docs = corpus::make_corpus(config);
-    auto num_docs = docs->size();
+    auto num_docs = docs.size();
     idx_->impl_->load_labels(num_docs);
 
     total_unique_terms_ = 0;
@@ -451,13 +447,13 @@
                                                  num_docs};
 
     // make md_writer with empty schema
-    metadata_writer md_writer{idx_->index_name(), num_docs, docs->schema()};
+    metadata_writer md_writer{idx_->index_name(), num_docs, docs.schema()};
 
     printing::progress progress{" > Creating postings from libsvm data: ",
                                 num_docs};
-    while (docs->has_next())
+    while (docs.has_next())
     {
-        auto doc = docs->next();
+        auto doc = docs.next();
         progress(doc.id());
 
         uint64_t num_unique = 0;
diff --git a/src/index/inverted_index.cpp b/src/index/inverted_index.cpp
index f47478570..5352c3539 100644
--- a/src/index/inverted_index.cpp
+++ b/src/index/inverted_index.cpp
@@ -54,7 +54,7 @@ class inverted_index::impl
      * @param ram_budget The total **estimated** RAM budget
      * @return the number of chunks created
      */
-    void tokenize_docs(corpus::corpus* docs,
+    void tokenize_docs(corpus::corpus& docs,
                        postings_inverter<inverted_index>& inverter,
                        metadata_writer& mdata_writer, uint64_t ram_budget);
 
@@ -110,7 +110,8 @@ bool inverted_index::valid() const
     return true;
 }
 
-void inverted_index::create_index(const cpptoml::table& config)
+void inverted_index::create_index(const cpptoml::table& config,
+                                  corpus::corpus& docs)
 {
     // save the config file so we can recreate the analyzer
     {
@@ -120,9 +121,6 @@
 
     LOG(info) << "Creating index: " << index_name() << ENDLG;
 
-    // load the documents from the corpus
-    auto docs = corpus::make_corpus(config);
-
     auto ram_budget = static_cast<uint64_t>(
         config.get_as<int64_t>("indexer-ram-budget").value_or(1024));
     auto max_writers = static_cast<unsigned>(
         config.get_as<int64_t>("max-writers").value_or(8));
 
     postings_inverter<inverted_index> inverter{index_name(), max_writers};
     {
-        metadata_writer mdata_writer{index_name(), docs->size(),
-                                     docs->schema()};
-        uint64_t num_docs = docs->size();
+        metadata_writer mdata_writer{index_name(), docs.size(), docs.schema()};
+        uint64_t num_docs = docs.size();
         impl_->load_labels(num_docs);
 
         // RAM budget is given in megabytes
-        inv_impl_->tokenize_docs(docs.get(), inverter, mdata_writer,
+        inv_impl_->tokenize_docs(docs, inverter, mdata_writer,
                                  ram_budget * 1024 * 1024);
     }
@@ -175,11 +172,11 @@ void inverted_index::load_index()
 }
 
 void inverted_index::impl::tokenize_docs(
-    corpus::corpus* docs, postings_inverter<inverted_index>& inverter,
+    corpus::corpus& docs, postings_inverter<inverted_index>& inverter,
     metadata_writer& mdata_writer, uint64_t ram_budget)
 {
     std::mutex mutex;
-    printing::progress progress{" > Tokenizing Docs: ", docs->size()};
+    printing::progress progress{" > Tokenizing Docs: ", docs.size()};
 
     auto task = [&](uint64_t ram_budget)
     {
         {
             std::lock_guard<std::mutex> lock{mutex};
-            if (!docs->has_next())
+            if (!docs.has_next())
                 return; // destructor for producer will write
                         // any intermediate chunks
-            doc = docs->next();
+            doc = docs.next();
             progress(doc->id());
         }

From c7516eb3131de21837e7a00c839cde2097adc59f Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 12 Feb 2016 15:22:13 -0600
Subject: [PATCH 46/48] add unit tests for full text metadata storage

---
 include/meta/corpus/corpus.h  |  8 ++++++--
 tests/inverted_index_test.cpp | 30 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/meta/corpus/corpus.h b/include/meta/corpus/corpus.h
index d609c24fd..d14565405 100644
--- a/include/meta/corpus/corpus.h
+++ b/include/meta/corpus/corpus.h
@@ -96,6 +96,12 @@ class corpus
      */
     bool store_full_text() const;
 
+    /**
+     * @param store_full_text Tells this corpus to store full document text as
+     * metadata
+     */
+    void set_store_full_text(bool store_full_text);
+
   protected:
     /**
      * Helper function to be used by deriving classes in implementing
@@ -108,8 +114,6 @@ class corpus
 
     void set_metadata_parser(metadata_parser&& mdparser);
 
-    void set_store_full_text(bool store_full_text);
-
     /// The type of encoding this document uses
     std::string encoding_;
     /// The metadata parser
diff --git a/tests/inverted_index_test.cpp b/tests/inverted_index_test.cpp
index 483a73f86..d62db6fe2 100644
--- a/tests/inverted_index_test.cpp
+++ b/tests/inverted_index_test.cpp
@@ -54,6 +54,19 @@ void check_term_id(Index& idx) {
         AssertThat(second, EqualsWithDelta(count.second, 0.001));
     }
 }
+
+void check_full_text(corpus::corpus& docs, const cpptoml::table& config) {
+    docs.set_store_full_text(true);
+    auto idx = index::make_index<index::inverted_index>(config, docs);
+
+    auto mdata = idx->metadata(doc_id{0});
+    auto content = mdata.get<std::string>("content");
+    AssertThat(*content, StartsWith(" In my opinion,"));
+
+    mdata = idx->metadata(doc_id{1007});
+    content = mdata.get<std::string>("content");
+    AssertThat(*content, StartsWith("I think we"));
+}
 }
 
 go_bandit([]() {
@@ -73,6 +86,12 @@ go_bandit([]() {
             check_ceeaus_expected(*idx);
             check_term_id(*idx);
         });
+
+        filesystem::remove_all("ceeaus-inv");
+        it("should be able to store full text metadata", [&]() {
+            auto docs = corpus::make_corpus(*file_cfg);
+            check_full_text(*docs, *file_cfg);
+        });
     });
 
     describe("[inverted-index] from line config", []() {
@@ -93,6 +112,12 @@ go_bandit([]() {
             check_term_id(*idx);
            check_term_id(*idx); // twice to check splay_caching
         });
+
+        filesystem::remove_all("ceeaus-inv");
+        it("should be able to store full text metadata", [&]() {
+            auto docs = corpus::make_corpus(*line_cfg);
+            check_full_text(*docs, *line_cfg);
+        });
     });
 
    describe("[inverted-index] with caches", []() {
@@ -139,6 +164,11 @@ go_bandit([]() {
             check_term_id(*idx);
         });
 
+        filesystem::remove_all("ceeaus-inv");
+        it("should be able to store full text metadata", [&]() {
+            auto docs = corpus::make_corpus(*gz_cfg);
+            check_full_text(*docs, *gz_cfg);
+        });
     });
 
     filesystem::remove_all("ceeaus-inv");

From 524d581f4bc79dcae3042ac81ea4557d7b065f04 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 12 Feb 2016 21:03:24 -0600
Subject: [PATCH 47/48] update CHANGELOG.md for v2.1.0

---
 CHANGELOG.md | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95957989f..6e8e6224d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,39 @@
+# [v2.1.0][2.1.0]
+## New features
+- Add the [GloVe algorithm](http://www-nlp.stanford.edu/pubs/glove.pdf) for
+  training word embeddings and a library class `word_embeddings` for loading and
+  querying trained embeddings. To facilitate returning word embeddings, a simple
+  `util::array_view` class was added.
+- Add a simple vector math library (and move `fastapprox` into the `math`
+  namespace).
+
+## Bug fixes
+- Fix `probe_map::extract()` for the `inline_key_value_storage` type; the old
+  implementation forgot to delete all sentinel values before returning the
+  vector.
+- Fix incorrect definition of `l1norm()` in `sgd_model`.
+- Fix `gmap` calculation where an average precision of 0 was ignored.
+
+## Enhancements
+- Improve performance of `printing::progress`. Before, `progress::operator()` in
+  tight loops could dramatically hurt performance, particularly due to frequent
+  calls to `std::chrono::steady_clock::now()`. Now, `progress::operator()`
+  simply sets an atomic iteration counter and a background thread periodically
+  wakes to update the progress output.
+- Allow full text storage in the index as a metadata field. If `store-full-text =
+  true` (default false) in the corpus config, the string metadata field
+  "content" will be added. This simplifies the creation of full text
+  metadata: the user doesn't have to duplicate their dataset in `metadata.dat`,
+  and `metadata.dat` will still be somewhat human-readable without large strings
+  of full text added.
+- Allow `make_index` to take a user-supplied corpus object.
+
+## Miscellaneous
+- ZLIB is now a required dependency.
+- Switch to just using the standalone `./unit-test` instead of `ctest`. There
+  aren't really many advantages for us in using CTest at this point with the new
+  unit test framework, so just use our unit test executable.
+
 # [v2.0.1][2.0.1]
 ## Bug fixes
 - Fix issue where `metadata_parser` would not consume spaces in string
@@ -304,7 +340,8 @@
 # [v1.0][1.0]
 - Initial release.
 
-[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.0.1...develop
+[unreleased]: https://github.com/meta-toolkit/meta/compare/v2.1.0...develop
+[2.1.0]: https://github.com/meta-toolkit/meta/compare/v2.0.1...v2.1.0
 [2.0.1]: https://github.com/meta-toolkit/meta/compare/v2.0.0...v2.0.1
 [2.0.0]: https://github.com/meta-toolkit/meta/compare/v1.3.8...v2.0.0
 [1.3.8]: https://github.com/meta-toolkit/meta/compare/v1.3.7...v1.3.8

From 8de3b896d4a19335587fb9f2e6561a626ec527d3 Mon Sep 17 00:00:00 2001
From: Sean Massung
Date: Fri, 12 Feb 2016 21:09:05 -0600
Subject: [PATCH 48/48] update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e8e6224d..32fd0dcd3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
   vector.
 - Fix incorrect definition of `l1norm()` in `sgd_model`.
 - Fix `gmap` calculation where an average precision of 0 was ignored.
+- Fix progress output in `multiway_merge`.
 
 ## Enhancements
 - Improve performance of `printing::progress`. Before, `progress::operator()` in
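For reference, a minimal sketch of how the two features added by this series fit together: a user-supplied corpus object for `make_index` (PATCH 45) plus full text metadata storage (PATCH 46). It uses only calls that appear in the patches above (`corpus::make_corpus`, `corpus::set_store_full_text`, the new `make_index` overload, and `metadata::get`); the `config.toml` file name and the `main` scaffolding are assumptions for illustration, not part of the patch series.

```cpp
#include <iostream>

#include "cpptoml.h"
#include "meta/corpus/corpus_factory.h"
#include "meta/index/inverted_index.h"
#include "meta/index/make_index.h"

using namespace meta;

int main()
{
    // Assumed: a standard MeTA config.toml with an inverted-index path and
    // a corpus definition.
    auto config = cpptoml::parse_file("config.toml");

    // Build the corpus ourselves instead of letting make_index create the
    // "default" corpus from the config...
    auto docs = corpus::make_corpus(*config);

    // ...so we can configure it before indexing: here, ask it to store each
    // document's full text in the "content" metadata field (equivalent to
    // setting store-full-text = true in the corpus config).
    docs->set_store_full_text(true);

    // New in v2.1.0: pass the user-supplied corpus object to make_index.
    auto idx = index::make_index<index::inverted_index>(*config, *docs);

    // The full text is now retrievable as string metadata for any document.
    auto mdata = idx->metadata(doc_id{0});
    if (auto content = mdata.get<std::string>("content"))
        std::cout << *content << "\n";
}
```

The same pattern appears in the new `check_full_text` unit test in PATCH 46; omitting the corpus argument falls back to the helper overload added in PATCH 45, which builds the corpus from the config for you.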