diff --git a/deeplima/apps/deeplima.cpp b/deeplima/apps/deeplima.cpp index 818bcb58e..7fa99bc0d 100644 --- a/deeplima/apps/deeplima.cpp +++ b/deeplima/apps/deeplima.cpp @@ -48,8 +48,8 @@ using namespace deeplima; class file_parser { public: -std::shared_ptr psegm = nullptr; -std::shared_ptr< ITokenSequenceAnalyzer > panalyzer = nullptr; +std::shared_ptr psegm = nullptr; // tokenizer +std::shared_ptr< ITokenSequenceAnalyzer > panalyzer = nullptr; // tagger std::shared_ptr< dumper::AbstractDumper > pdumper_segm_only = nullptr; // used when using segmentation only std::shared_ptr< dumper::DumperBase > pdumper_complete = nullptr; // used when using tagger std::shared_ptr parser = nullptr; @@ -241,6 +241,8 @@ void init(const std::map& models_fn, }); } + // NOTE Commented out because psegm is now instantiated for each file in + // parse_file. This is a temporary solution while reusing it fails. // psegm->register_handler([panalyzer] // (const std::vector& tokens, // uint32_t len) @@ -289,7 +291,8 @@ void parse_file(std::istream& input, } catch (std::runtime_error& e) { - std::cerr << "In parse_file: failed to load model file " << models_fn.find("tok")->second << ": " + std::cerr << "In parse_file: failed to load model file " + << models_fn.find("tok")->second << ": " << e.what() << std::endl; throw; } @@ -340,7 +343,7 @@ void parse_file(std::istream& input, // std::cerr << "Waiting for PoS tagger to stop. Calling panalyzer->finalize" << std::endl; panalyzer->finalize(); pdumper_complete->flush(); - std::cerr << "Analyzer stopped. panalyzer->finalize returned" << std::endl; + // std::cerr << "Analyzer stopped. 
panalyzer->finalize returned" << std::endl; } if (parser) @@ -356,10 +359,14 @@ void parse_file(std::istream& input, uint64_t token_counter = 0; if(nullptr != pdumper_segm_only) + { token_counter = pdumper_segm_only->get_token_counter(); + pdumper_segm_only->reset(); + } else if (nullptr != pdumper_complete) { token_counter = pdumper_complete->get_token_counter(); + pdumper_complete->reset(); } else { diff --git a/deeplima/include/deeplima/dependency_parser.h b/deeplima/include/deeplima/dependency_parser.h index a20725de7..b17b596a3 100644 --- a/deeplima/include/deeplima/dependency_parser.h +++ b/deeplima/include/deeplima/dependency_parser.h @@ -16,6 +16,7 @@ #include "utils/str_index.h" #include "helpers/path_resolver.h" #include "deeplima/graph_dp.h" +#include "deeplima/token_type.h" // #include "graph_dp/impl/graph_dp_impl.h" #include "segmentation/impl/segmentation_decoder.h" #include "token_sequence_analyzer.h" @@ -76,7 +77,7 @@ class DependencyParser m_ptoken(nullptr) { } - inline typename tokens_with_analysis_t::token_t::token_flags_t flags() const + inline token_flags_t flags() const { assert(nullptr != m_ptoken); return m_ptoken->m_flags; @@ -85,7 +86,7 @@ class DependencyParser inline bool eos() const { assert(nullptr != m_ptoken); - return flags() & DependencyParser::tokens_with_analysis_t::token_t::token_flags_t::sentence_brk; + return flags() & token_flags_t::sentence_brk; } inline uint32_t cls(size_t idx) const @@ -151,7 +152,7 @@ class DependencyParser return m_current >= m_end; } - inline impl::token_t::token_flags_t flags() const + inline token_flags_t flags() const { assert(! 
end()); return m_buffer[m_current].m_flags; @@ -235,8 +236,9 @@ class DependencyParser : m_buffer_size(buffer_size), m_current_buffer(0), m_current_timepoint(0), - m_stridx_ptr(stridx)//, - // m_stridx(*stridx) + m_stridx_ptr(stridx), + // m_stridx(*stridx), + m_impl() { assert(m_buffer_size > 0); assert(num_buffers > 0); @@ -307,9 +309,10 @@ class DependencyParser } } + // Apply the model to the sequence of tokens given by iter from the tagger void operator()(TokenSequenceAnalyzer<>::TokenIterator& iter) { - // std::cerr << "DependencyParser::operator()" << std::endl; + // std::cerr << "DependencyParser::operator(TokenSequenceAnalyzer<>::TokenIterator& iter)" << std::endl; if (m_current_timepoint >= m_buffer_size) { acquire_buffer(); @@ -334,7 +337,7 @@ class DependencyParser token.m_len = 0; token.m_form_idx = m_stridx_ptr->get_idx(""); // std::cerr << "" << std::endl; - token.m_flags = impl::token_t::token_flags_t(segmentation::token_pos::flag_t::none); + token.m_flags = token_flags_t::none; token.m_lemm_idx = token.m_form_idx; insert_root = false; tokens_to_process--; @@ -360,8 +363,8 @@ class DependencyParser token.m_classes[i] = iter.token_class(i); } - if (iter.flags() & segmentation::token_pos::flag_t::sentence_brk || - iter.flags() & segmentation::token_pos::flag_t::paragraph_brk) + if (iter.flags() & token_flags_t::sentence_brk || + iter.flags() & token_flags_t::paragraph_brk) { insert_root = true; } @@ -480,8 +483,8 @@ class DependencyParser // << "; m_buffer_size=" << m_buffer_size // << "; token=" << iter.form() << std::endl; - if (iter.flags() & segmentation::token_pos::flag_t::sentence_brk || - iter.flags() & segmentation::token_pos::flag_t::paragraph_brk) + if (iter.flags() & token_flags_t::sentence_brk || + iter.flags() & token_flags_t::paragraph_brk) { break; // lengths.push_back(this_sentence_tokens); @@ -554,16 +557,16 @@ class GraphDpImpl: public deeplima::graph_dp::impl::GraphDependencyParser m_curr_buff_idx(0) {} - GraphDpImpl( - size_t 
threads, - size_t buffer_size_per_thread - ) - : deeplima::graph_dp::impl::GraphDependencyParser( - 0 /* TODO: FIX ME */, 4, threads * 2, buffer_size_per_thread, threads), - m_fastText(std::make_shared>()), - m_current_timepoint(deeplima::graph_dp::impl::GraphDependencyParser::get_start_timepoint()) - { - } + // GraphDpImpl( + // size_t threads, + // size_t buffer_size_per_thread + // ) + // : deeplima::graph_dp::impl::GraphDependencyParser( + // 0 /* TODO: FIX ME */, 4, threads * 2, buffer_size_per_thread, threads), + // m_fastText(std::make_shared>()), + // m_current_timepoint(deeplima::graph_dp::impl::GraphDependencyParser::get_start_timepoint()) + // { + // } std::shared_ptr convert(const EmbdStrFloat& src) { diff --git a/deeplima/include/deeplima/dumper_conllu.h b/deeplima/include/deeplima/dumper_conllu.h index 0daf4bd64..aab45a0b2 100644 --- a/deeplima/include/deeplima/dumper_conllu.h +++ b/deeplima/include/deeplima/dumper_conllu.h @@ -10,6 +10,8 @@ // #include "deeplima/segmentation/impl/segmentation_impl.h" +#include "deeplima/token_type.h" + namespace deeplima { namespace dumper @@ -158,6 +160,11 @@ class AbstractDumper : m_token_counter(0) { } virtual ~AbstractDumper() { } + + void reset() + { + m_token_counter = 0; + } }; class Horizontal : public AbstractDumper @@ -199,8 +206,8 @@ class Horizontal : public AbstractDumper } std::cout << str << " "; - if (tokens[i].m_flags & deeplima::segmentation::token_pos::flag_t::sentence_brk || - tokens[i].m_flags & deeplima::segmentation::token_pos::flag_t::paragraph_brk) + if (tokens[i].m_flags & token_flags_t::sentence_brk || + tokens[i].m_flags & token_flags_t::paragraph_brk) { // std::cerr << "Horizontal endl" << std::endl; std::cout << std::endl; @@ -265,8 +272,8 @@ class TokensToConllU : public AbstractDumper increment_token_counter(); m_next_token_idx += 1; - if (tokens[i].m_flags & deeplima::segmentation::token_pos::flag_t::sentence_brk || - tokens[i].m_flags & 
deeplima::segmentation::token_pos::flag_t::paragraph_brk) + if (tokens[i].m_flags & token_flags_t::sentence_brk || + tokens[i].m_flags & token_flags_t::paragraph_brk) { // std::cerr << "TokensToConllU end of sentence" << std::endl; std::cout << std::endl; @@ -285,6 +292,7 @@ class DumperBase virtual ~DumperBase() = default; virtual uint64_t get_token_counter() const = 0; virtual void flush() = 0; + virtual void reset() = 0; }; template @@ -296,6 +304,11 @@ class AnalysisToConllU : public DumperBase std::vector m_tokens; uint32_t m_root; + void reset() + { + m_token_counter = 0; + } + inline void increment_token_counter() { ++m_token_counter; @@ -315,13 +328,14 @@ class AnalysisToConllU : public DumperBase m_has_feats(false), m_first_feature_to_print(0) { + // std::cerr << "AnalysisToConllU()" << (void*)this << std::endl; } virtual ~AnalysisToConllU() { + // std::cerr << "~AnalysisToConllU " << (void*)this << std::endl; // if (m_next_token_idx > 1) // { - // std::cerr << "on AnalysisToConllU destructor" << std::endl; // std::cout << std::endl; // } } @@ -557,8 +571,8 @@ class AnalysisToConllU : public DumperBase increment_token_counter(); m_next_token_idx += 1; - if (iter.flags() & deeplima::segmentation::token_pos::flag_t::sentence_brk || - iter.flags() & deeplima::segmentation::token_pos::flag_t::paragraph_brk) + if (iter.flags() & token_flags_t::sentence_brk || + iter.flags() & token_flags_t::paragraph_brk) { // std::cerr << "AnalysisToConllU::operator() on sent/para break. 
m_next_token_idx=" // << m_next_token_idx << std::endl; diff --git a/deeplima/include/deeplima/eigen_wrp/bilstm_and_dense.h b/deeplima/include/deeplima/eigen_wrp/bilstm_and_dense.h index 7cd4a0988..4a124dc6f 100644 --- a/deeplima/include/deeplima/eigen_wrp/bilstm_and_dense.h +++ b/deeplima/include/deeplima/eigen_wrp/bilstm_and_dense.h @@ -98,10 +98,11 @@ class Op_BiLSTM_Dense_ArgMax : public Op_Base bool precompute() { - std::cerr << "fw weights size: " << bilstm.fw.weight_hh.rows() << " x " << bilstm.fw.weight_hh.cols() << std::endl; + // std::cerr << "fw weights size: " << bilstm.fw.weight_hh.rows() + // << " x " << bilstm.fw.weight_hh.cols() << std::endl; size_t hidden_size = bilstm.fw.weight_hh.cols(); - std::cerr << "precompute(fw.input):" << std::endl; + // std::cerr << "precompute(fw.input):" << std::endl; // /* mul_fw.matmul_input = bilstm.fw.weight_hh.block(0, 0, hidden_size, hidden_size).inverse().partialPivLu(); mul_fw.matmul_forget = bilstm.fw.weight_hh.block(hidden_size, 0, hidden_size, hidden_size).inverse().partialPivLu(); @@ -110,7 +111,7 @@ class Op_BiLSTM_Dense_ArgMax : public Op_Base // */ hidden_size = bilstm.bw.weight_hh.cols(); - std::cerr << "precompute(bw.input):" << std::endl; + // std::cerr << "precompute(bw.input):" << std::endl; // /* mul_bw.matmul_input = bilstm.bw.weight_hh.block(0, 0, hidden_size, hidden_size).inverse().partialPivLu(); mul_bw.matmul_forget = bilstm.bw.weight_hh.block(hidden_size, 0, hidden_size, hidden_size).inverse().partialPivLu(); @@ -118,7 +119,7 @@ class Op_BiLSTM_Dense_ArgMax : public Op_Base mul_bw.matmul_output = bilstm.bw.weight_hh.block(hidden_size*3, 0, hidden_size, hidden_size).inverse().partialPivLu(); // */ - std::cerr << "end of precomputing" << std::endl; + // std::cerr << "end of precomputing" << std::endl; return true; } #else @@ -142,18 +143,18 @@ class Op_BiLSTM_Dense_ArgMax : public Op_Base { if constexpr (std::is_integral_v && std::is_signed_v) { - std::cerr << "Converting hh to fixed_point" << 
std::endl; - std::cerr << "min(fw_weight_hh) = " << bilstm.fw.weight_hh.minCoeff() << " " - << "max(fw_weight_hh) = " << bilstm.fw.weight_hh.maxCoeff() << std::endl; + // std::cerr << "Converting hh to fixed_point" << std::endl; + // std::cerr << "min(fw_weight_hh) = " << bilstm.fw.weight_hh.minCoeff() << " " + // << "max(fw_weight_hh) = " << bilstm.fw.weight_hh.maxCoeff() << std::endl; convert_matrix(bilstm.fw.weight_hh, weight_fw_hh_fixed_point); - std::cerr << "min(fw_weight_hh) = " << static_cast(weight_fw_hh_fixed_point.minCoeff()) / WEIGHT_FRACTION_MULT << " " - << "max(fw_weight_hh) = " << static_cast(weight_fw_hh_fixed_point.maxCoeff()) / WEIGHT_FRACTION_MULT << std::endl; + // std::cerr << "min(fw_weight_hh) = " << static_cast(weight_fw_hh_fixed_point.minCoeff()) / WEIGHT_FRACTION_MULT << " " + // << "max(fw_weight_hh) = " << static_cast(weight_fw_hh_fixed_point.maxCoeff()) / WEIGHT_FRACTION_MULT << std::endl; - std::cerr << "min(bw_weight_hh) = " << bilstm.bw.weight_hh.minCoeff() << " " - << "max(bw_weight_hh) = " << bilstm.bw.weight_hh.maxCoeff() << std::endl; + // std::cerr << "min(bw_weight_hh) = " << bilstm.bw.weight_hh.minCoeff() << " " + // << "max(bw_weight_hh) = " << bilstm.bw.weight_hh.maxCoeff() << std::endl; convert_matrix(bilstm.bw.weight_hh, weight_bw_hh_fixed_point); - std::cerr << "min(bw_weight_hh) = " << static_cast(weight_bw_hh_fixed_point.minCoeff()) / WEIGHT_FRACTION_MULT << " " - << "max(bw_weight_hh) = " << static_cast(weight_bw_hh_fixed_point.maxCoeff()) / WEIGHT_FRACTION_MULT << std::endl; + // std::cerr << "min(bw_weight_hh) = " << static_cast(weight_bw_hh_fixed_point.minCoeff()) / /*WEIGHT_FRACTION_MULT << " " + // << "max(bw_weight_hh) = " << static_cast(weight_bw_hh_fixed_point.maxCoeff()) / WEIGHT_FRACTION_MULT << std::endl;*/ } return true; diff --git a/deeplima/include/deeplima/eigen_wrp/lstm_beam_decoder.h b/deeplima/include/deeplima/eigen_wrp/lstm_beam_decoder.h index a07eeb463..5ab5d1fa6 100644 --- 
a/deeplima/include/deeplima/eigen_wrp/lstm_beam_decoder.h +++ b/deeplima/include/deeplima/eigen_wrp/lstm_beam_decoder.h @@ -168,11 +168,11 @@ class Op_LSTM_Beam_Decoder : public Op_Base decoding_step++; M& states_c = wb->states_c; - if (states_c.cols() != beam_size) + if ((size_t)states_c.cols() != beam_size) states_c = M::Zero(hidden_size, beam_size); for (size_t i = 0; i < beam_size; ++i) states_c.col(i) = c; M& states_h = wb->states_h; - if (states_h.cols() != beam_size) + if ((size_t)states_h.cols() != beam_size) states_h = M::Zero(hidden_size, beam_size); for (size_t i = 0; i < beam_size; ++i) states_h.col(i) = h; diff --git a/deeplima/include/deeplima/eigen_wrp/word_seq_embd_vectorizer.h b/deeplima/include/deeplima/eigen_wrp/word_seq_embd_vectorizer.h index 57a04842d..49266330b 100644 --- a/deeplima/include/deeplima/eigen_wrp/word_seq_embd_vectorizer.h +++ b/deeplima/include/deeplima/eigen_wrp/word_seq_embd_vectorizer.h @@ -532,7 +532,7 @@ class WordSeqEmbdVectorizerWithPrecomputing m_pModel->precompute_inputs(input, Parent::m_precomputed_vectors[Parent::m_curr_bucket_id], 0); Parent::m_curr_bucket_id++; - if (Parent::m_curr_bucket_id >= Parent::m_precomputed_vectors.size()) + if ((size_t)Parent::m_curr_bucket_id >= Parent::m_precomputed_vectors.size()) { Parent::m_precomputed_vectors.resize(Parent::m_curr_bucket_id + 1); } diff --git a/deeplima/include/deeplima/ner.h b/deeplima/include/deeplima/ner.h index b7d21a43e..d6357c49d 100644 --- a/deeplima/include/deeplima/ner.h +++ b/deeplima/include/deeplima/ner.h @@ -120,9 +120,31 @@ namespace impl #error Unknown inference engine #endif + /** + * A kind of RnnSequenceClassifier, used for named entities tagging (?), but + * also the parent of TaggingImpl, used as member in TokenSequenceAnalyzer + */ template class EntityTaggingClassifier: public RnnSequenceClassifier, BaseMatrix, uint8_t> - {}; + { + public: + EntityTaggingClassifier() : + RnnSequenceClassifier, BaseMatrix, uint8_t>() + { + } + + // 
EntityTaggingClassifier(uint32_t max_feat, + // uint32_t overlap, + // uint32_t num_slots, + // uint32_t slot_len, + // uint32_t num_threads) : + // RnnSequenceClassifier, BaseMatrix, uint8_t>( + // max_feat, overlap, num_slots, slot_len, num_threads) + // { + // } + + virtual ~EntityTaggingClassifier() = default; + }; } // namespace impl diff --git a/deeplima/include/deeplima/nets/birnn_seq_cls.h b/deeplima/include/deeplima/nets/birnn_seq_cls.h index 1ad15891b..18d081fed 100644 --- a/deeplima/include/deeplima/nets/birnn_seq_cls.h +++ b/deeplima/include/deeplima/nets/birnn_seq_cls.h @@ -28,14 +28,52 @@ std::ostream& operator<< (std::ostream& out, const std::vector& v) { return out; } +/** + * The RnnSequenceClassifier is a Model, able to infer but also a thread pool + * to dispatch the work between several threads. And also a vectorizer, here + * a matrix. + */ template */, class Out> class RnnSequenceClassifier : public InputVectorizer, public ThreadPool< RnnSequenceClassifier >, public Model { +public: + RnnSequenceClassifier() + : m_overlap(0), + m_num_slots(0), + m_slot_len(0), + m_slots(), + m_lengths(), + m_output(std::make_shared< StdMatrix >()) + {} + + // RnnSequenceClassifier(uint32_t max_feat, + // uint32_t overlap, + // uint32_t num_slots, + // uint32_t slot_len, + // uint32_t num_threads) + // : m_overlap(0), + // m_num_slots(0), + // m_slot_len(0), + // m_slots(), + // m_lengths(), + // m_output(std::make_shared< StdMatrix >()) + // { + // init(max_feat, overlap, num_slots, slot_len, num_threads); + // } + + virtual ~RnnSequenceClassifier() + { + // std::cerr << "-> ~RnnSequenceClassifier" << std::endl; + RnnSequenceClassifierThreadPool::stop(); + // std::cerr << "<- ~RnnSequenceClassifier" << std::endl; + } + +protected: typedef RnnSequenceClassifier ThisClass; - typedef ThreadPool< RnnSequenceClassifier > ThreadPoolParent; - friend ThreadPoolParent; + typedef ThreadPool< ThisClass > RnnSequenceClassifierThreadPool; + friend 
RnnSequenceClassifierThreadPool; enum slot_flags_t : uint8_t { @@ -94,18 +132,23 @@ class RnnSequenceClassifier : public InputVectorizer, m_next(nullptr), m_lengths(s.m_lengths) { } + ~slot_t() = default; + slot_t& operator=(const slot_t&s) + { + m_input_begin = s.m_input_begin; + m_input_end = s.m_input_end; + m_output_begin = s.m_output_begin; + m_output_end = s.m_output_end; + m_flags = s.m_flags; + m_work_started = s.m_work_started; + m_lock_count = 0; + m_prev = nullptr; + m_next = nullptr; + m_lengths = s.m_lengths; + return *this; + } }; -protected: - uint32_t m_overlap; - uint32_t m_num_slots; - uint32_t m_slot_len; - - std::vector m_slots; - std::vector> m_lengths; - std::shared_ptr< StdMatrix > m_output; // external - classifier id, internal - time position - - inline int32_t prev_slot(uint32_t idx) { assert(idx < m_num_slots); @@ -129,6 +172,7 @@ class RnnSequenceClassifier : public InputVectorizer, } } + /** Push the slot @ref idx in the thread pool for starting the job on it. */ inline void start_job_impl(uint32_t idx) { assert(idx < m_num_slots); @@ -142,7 +186,7 @@ class RnnSequenceClassifier : public InputVectorizer, if (! 
slot.m_work_started) { slot.m_work_started = true; - ThreadPoolParent::push(&slot); + RnnSequenceClassifierThreadPool::push(&slot); } } @@ -178,8 +222,8 @@ class RnnSequenceClassifier : public InputVectorizer, // << "; flags= " << int(slot.m_flags) // << "; prev=" << (void*)slot.m_prev // << "; next=" << (void*)slot.m_next - // // << "; output=" << (*(this_ptr->m_output))[0] - // << std::endl; + // << "; output=" << (*(this_ptr->m_output))[0] + // << std::endl; // this_ptr->pretty_print(); assert(slot.m_lock_count > 0); @@ -210,37 +254,17 @@ class RnnSequenceClassifier : public InputVectorizer, return m_output; } - RnnSequenceClassifier() - : m_overlap(0), - m_num_slots(0), - m_slot_len(0), - m_slots(), - m_lengths(), - m_output(std::make_shared< StdMatrix >()) - {} - - RnnSequenceClassifier(uint32_t max_feat, - uint32_t overlap, - uint32_t num_slots, - uint32_t slot_len, - uint32_t num_threads) - : m_overlap(0), - m_num_slots(0), - m_slot_len(0), - m_slots(), - m_lengths(), - m_output(std::make_shared< StdMatrix >()) - { - init(max_feat, overlap, num_slots, slot_len, num_threads); - } - /** * Need to be called to be able to reuse this classifier on several sequences */ - void reset() + virtual void reset() { + // std::cerr << "RnnSequenceClassifier::reset()" << (void*)this << std::endl; + m_slots.clear(); + m_slots.resize(m_num_slots); for (size_t i = 0; i < m_num_slots; i++) { + m_slots[i] = slot_t(); slot_t& slot = m_slots[i]; slot.m_output_begin = m_overlap + i * m_slot_len; @@ -273,11 +297,10 @@ class RnnSequenceClassifier : public InputVectorizer, // << " output begin=" << slot.m_output_begin << ", end=" << slot.m_output_end // << std::endl; } - - } - void init(uint32_t max_feat, + + virtual void init(uint32_t max_feat, uint32_t overlap, uint32_t num_slots, uint32_t slot_len, @@ -288,7 +311,7 @@ class RnnSequenceClassifier : public InputVectorizer, // RnnSequenceClassifier::init 1024, 16, 8, 1024, 1, true // RnnSequenceClassifier::init 464, 0, 8, 1024, 1, 
false - // std::cerr << "RnnSequenceClassifier::init max_feat=" << max_feat << ", overlap=" << overlap + // std::cerr << "RnnSequenceClassifier::init "<<(void*)this<<" max_feat=" << max_feat << ", overlap=" << overlap // << ", num_slots=" << num_slots // << ", slot_len=" << slot_len // << ", num_threads=" << num_threads @@ -303,10 +326,8 @@ class RnnSequenceClassifier : public InputVectorizer, { Model::init_new_worker(m_slot_len + m_overlap * 2, precomputed_input); // skip id - all workers are identical } - ThreadPoolParent::init(num_threads); + RnnSequenceClassifierThreadPool::init(num_threads); - m_slots.clear(); - m_slots.resize(m_num_slots); reset(); // set up slots m_lengths.resize(m_num_slots); @@ -331,13 +352,6 @@ class RnnSequenceClassifier : public InputVectorizer, Model::get_classes_from_fn(fn, classes_names, classes); } - virtual ~RnnSequenceClassifier() - { - // std::cerr << "-> ~RnnSequenceClassifier" << std::endl; - ThreadPoolParent::stop(); - // std::cerr << "<- ~RnnSequenceClassifier" << std::endl; - } - inline uint8_t get_output(uint64_t pos, uint8_t cls) { assert(cls < m_output->size()); @@ -375,7 +389,8 @@ class RnnSequenceClassifier : public InputVectorizer, { assert(idx < m_num_slots); m_slots[idx].m_lock_count += v; - // std::cerr << "RnnSequenceClassifier::increment_lock_count by " << int(v) << " for slot " << int(idx+1) + // std::cerr << "RnnSequenceClassifier::increment_lock_count by " << int(v) + // << " for slot " << int(idx+1) // << ". 
it is now: " << int(m_slots[idx].m_lock_count) << std::endl; // pretty_print(); } @@ -516,7 +531,7 @@ class RnnSequenceClassifier : public InputVectorizer, { // std::cerr << "RnnSequenceClassifier::wait_for_slot in while lock_count=" << int(slot.m_lock_count) << std::endl; // pretty_print(); - ThreadPoolParent::wait_for_any_job_notification([&slot]() { + RnnSequenceClassifierThreadPool::wait_for_any_job_notification([&slot]() { return 1 == slot.m_lock_count; } ); @@ -525,13 +540,23 @@ class RnnSequenceClassifier : public InputVectorizer, void pretty_print() const { - std::cerr << "SLOTS: "; + std::cerr << (void*)this << " " << "SLOTS: "; for (size_t i = 0; i < m_num_slots; i++) { std::cerr << " | " << int(m_slots[i].m_lock_count); } std::cerr << " |" << std::endl; } + +protected: + uint32_t m_overlap; + uint32_t m_num_slots; + uint32_t m_slot_len; + + std::vector m_slots; + std::vector> m_lengths; + std::shared_ptr< StdMatrix > m_output; // external - classifier id, internal - time position + }; } // namespace deeplima diff --git a/deeplima/include/deeplima/reader_conllu.h b/deeplima/include/deeplima/reader_conllu.h index 778a3c2f8..bbe9fc74e 100644 --- a/deeplima/include/deeplima/reader_conllu.h +++ b/deeplima/include/deeplima/reader_conllu.h @@ -90,7 +90,7 @@ class CoNLLUReader : public FormattedReaderBase if (token_idx > 0) { token_pos& token = tokens[token_idx - 1]; - token.m_flags = token_pos::flag_t(token.m_flags | token_pos::flag_t::sentence_brk); + token.m_flags = token_flags_t(token.m_flags | token_flags_t::sentence_brk); } continue; } @@ -112,7 +112,7 @@ class CoNLLUReader : public FormattedReaderBase const char* p_after_eol = p_eol + 1; if (*p_after_eol == '\n') { token_pos& token = tokens[tokens.size() - 1]; - token.m_flags = token_pos::flag_t(token.m_flags | token_pos::flag_t::sentence_brk); + token.m_flags = token_flags_t(token.m_flags | token_flags_t::sentence_brk); } } @@ -133,7 +133,7 @@ class CoNLLUReader : public FormattedReaderBase p++; 
token_pos& token = tokens[token_idx - 1]; - token.m_flags = token_pos::flag_t(token.m_flags | token_pos::flag_t::sentence_brk); + token.m_flags = token_flags_t(token.m_flags | token_flags_t::sentence_brk); } m_callback(tokens, token_idx); token_idx = 0; @@ -197,7 +197,7 @@ class CoNLLUReader : public FormattedReaderBase if (eos) { - token.m_flags = token_pos::flag_t(token.m_flags | token_pos::flag_t::sentence_brk); + token.m_flags = token_flags_t(token.m_flags | token_flags_t::sentence_brk); } return true; diff --git a/deeplima/include/deeplima/segmentation/impl/char_ngram_encoder.h b/deeplima/include/deeplima/segmentation/impl/char_ngram_encoder.h index c3f01ecb7..9234f9482 100644 --- a/deeplima/include/deeplima/segmentation/impl/char_ngram_encoder.h +++ b/deeplima/include/deeplima/segmentation/impl/char_ngram_encoder.h @@ -138,7 +138,7 @@ class CharNgramEncoder : public StreamDecoder = ONE_POS_MASK(StreamDecoder::bits_per_position(nd.m_type), typename StreamDecoder::buffer_t); #ifndef NDEBUG - std::cerr << "one_pos_mask == " << pretty_bits_to_string(one_pos_mask) << std::endl; + // std::cerr << "one_pos_mask == " << pretty_bits_to_string(one_pos_mask) << std::endl; #endif typename StreamDecoder::buffer_t mask = 0; @@ -157,8 +157,8 @@ class CharNgramEncoder : public StreamDecoder m_shift[i] = StreamDecoder::bits_per_position(nd.m_type) * (m_lookahead - l); #ifndef NDEBUG - std::cerr << "mask [" << i << "] == " << pretty_bits_to_string(m_mask[i]) << std::endl; - std::cerr << "shift [" << i << "] == " << (uint32_t)m_shift[i] << std::endl; + // std::cerr << "mask [" << i << "] == " << pretty_bits_to_string(m_mask[i]) << std::endl; + // std::cerr << "shift [" << i << "] == " << (uint32_t)m_shift[i] << std::endl; #endif } diff --git a/deeplima/include/deeplima/segmentation/impl/segmentation_decoder.h b/deeplima/include/deeplima/segmentation/impl/segmentation_decoder.h index 9f6d416e0..7fd198b95 100644 --- 
a/deeplima/include/deeplima/segmentation/impl/segmentation_decoder.h +++ b/deeplima/include/deeplima/segmentation/impl/segmentation_decoder.h @@ -15,6 +15,7 @@ #include #include "deeplima/utils/std_matrix.h" +#include "deeplima/token_type.h" namespace deeplima { @@ -23,27 +24,19 @@ namespace segmentation struct token_pos { - enum flag_t : uint8_t - { - none = 0x00, - sentence_brk = 0x01, - paragraph_brk = 0x02, - max_flags - }; - uint16_t m_offset; // offset from previous token end uint16_t m_len; // length of this token in bytes const char* m_pch; - flag_t m_flags; + token_flags_t m_flags; token_pos() - : m_offset(0), m_len(0), m_pch(nullptr), m_flags(none) {} + : m_offset(0), m_len(0), m_pch(nullptr), m_flags(token_flags_t::none) {} inline void clear() { m_offset = m_len = 0; m_pch = nullptr; - m_flags = none; + m_flags = token_flags_t::none; } inline bool empty() const @@ -319,7 +312,7 @@ class SegmentationDecoder : public CharReader<> // TODO insert the marker for case continuing [[case_]] case segm_tag_t::E_EOS: - m_tokens[pos].m_flags = token_pos::flag_t(m_tokens[pos].m_flags | token_pos::flag_t::sentence_brk); + m_tokens[pos].m_flags = token_flags_t(m_tokens[pos].m_flags | token_flags_t::sentence_brk); [[fallthrough]]; case segm_tag_t::E: @@ -351,7 +344,7 @@ class SegmentationDecoder : public CharReader<> assert(0 == m_tokens[pos].m_len); m_tokens[pos].m_pch = *pch; m_tokens[pos].m_len += m_len[from]; - m_tokens[pos].m_flags = token_pos::flag_t(m_tokens[pos].m_flags | token_pos::flag_t::sentence_brk); + m_tokens[pos].m_flags = token_flags_t(m_tokens[pos].m_flags | token_flags_t::sentence_brk); save_current_token(pos, temp_token_len, start); } break; diff --git a/deeplima/include/deeplima/segmentation/impl/segmentation_impl.h b/deeplima/include/deeplima/segmentation/impl/segmentation_impl.h index 54f320233..36439d95f 100644 --- a/deeplima/include/deeplima/segmentation/impl/segmentation_impl.h +++ 
b/deeplima/include/deeplima/segmentation/impl/segmentation_impl.h @@ -49,24 +49,29 @@ namespace eigen_impl typedef DictEmbdVectorizer EmbdVectorizer; } -namespace impl { +namespace impl +{ using CharNgramEncoderFromUtf8 = CharNgramEncoder< Utf8Reader<> > ; using SegmentationClassifier = RnnSequenceClassifier ; using InputEncoder = CharNgramEncoderFromUtf8; using OutputDecoder = SegmentationDecoder; +/** + * The implementation of the segmenter, a SegmentationClassifier, itself a + * RnnSequenceClassifier + */ class SegmentationImpl: public ISegmentation, public SegmentationClassifier { public: SegmentationImpl(); - SegmentationImpl( - const std::vector& ngram_descr, - size_t threads, - size_t buffer_size_per_thread - ); + // SegmentationImpl( + // const std::vector& ngram_descr, + // size_t threads, + // size_t buffer_size_per_thread + // ); virtual ~SegmentationImpl() = default; diff --git a/deeplima/include/deeplima/tagging/impl/tagging_impl.h b/deeplima/include/deeplima/tagging/impl/tagging_impl.h index 9f0f19ce2..1e07f6d90 100644 --- a/deeplima/include/deeplima/tagging/impl/tagging_impl.h +++ b/deeplima/include/deeplima/tagging/impl/tagging_impl.h @@ -40,7 +40,7 @@ class enriched_token_t m_ptoken(nullptr) { } - inline token_buffer_t<>::token_t::token_flags_t flags() const + inline token_flags_t flags() const { assert(nullptr != m_ptoken); return m_ptoken->m_flags; @@ -49,7 +49,7 @@ class enriched_token_t inline bool eos() const { assert(nullptr != m_ptoken); - return flags() & token_buffer_t<>::token_t::token_flags_t::sentence_brk; + return flags() & token_flags_t::sentence_brk; } inline const std::string& form() const @@ -83,6 +83,12 @@ class enriched_token_buffer_t } }; + +/** + * Class implementing the tagger, used as member in TokenSequenceAnalyzer, the + * main tagger class + * Son of EntityTaggingClassifier (defined in ner.h), itself a RnnSequenceClassifier + */ template class TaggingImpl: public EntityTaggingClassifier { @@ -94,21 +100,59 @@ class 
TaggingImpl: public EntityTaggingClassifier public: TaggingImpl() : + Classifier(), m_fastText(std::make_shared>()), + m_current_timepoint(Classifier::get_start_timepoint()), m_current_slot_timepoints(0), - m_current_slot_no(-1), - m_last_completed_slot(-1), - m_curr_buff_idx(0) + // m_current_slot_no(-1), + m_last_completed_slot(-1) //, + // m_curr_buff_idx(0) {} - TaggingImpl( - size_t threads, - size_t buffer_size_per_thread - ) - : Classifier(0 /* TODO: FIX ME */, 4, threads * 2, buffer_size_per_thread, threads), - m_fastText(std::make_shared>()), - m_current_timepoint(Classifier::get_start_timepoint()) + // TaggingImpl( + // size_t threads, + // size_t buffer_size_per_thread + // ) : + // Classifier(), + // m_fastText(std::make_shared>()), + // m_current_timepoint(Classifier::get_start_timepoint()), + // m_current_slot_timepoints(0), + // // m_current_slot_no(-1), + // m_last_completed_slot(-1) //, + // // m_curr_buff_idx(0) + // { + // } + + virtual ~TaggingImpl() + { + // std::cerr << "~TaggingImpl" << std::endl; + } + + virtual void init(size_t threads, size_t num_buffers, + size_t buffer_size_per_thread, StringIndex& stridx) + { + m_fastText->get_words([&stridx](const std::string& word){ stridx.get_idx(word); }); + + m_vectorizer.init_features({ + { Vectorizer::str_feature, "form", m_fastText } + }); + + m_vectorizer.set_model(this); + + Classifier::init(m_vectorizer.dim(), + 16, num_buffers, buffer_size_per_thread, threads, + m_vectorizer.is_precomputing()); + + m_current_timepoint = Classifier::get_start_timepoint(); + } + + virtual void reset() { + // std::cerr << "TaggingImpl::reset" << std::endl; + Classifier::reset(); + m_current_timepoint = Classifier::get_start_timepoint(); + m_current_slot_timepoints = 0; + m_last_completed_slot = -1; } virtual void load(const std::string& fn, const PathResolver& path_resolver) @@ -129,23 +173,6 @@ class TaggingImpl: public EntityTaggingClassifier m_fastText->load(fastText_fn); } - void init(size_t threads, size_t 
num_buffers, size_t buffer_size_per_thread, StringIndex& stridx) - { - m_fastText->get_words([&stridx](const std::string& word){ stridx.get_idx(word); }); - - m_vectorizer.init_features({ - { Vectorizer::str_feature, "form", m_fastText } - }); - - m_vectorizer.set_model(this); - - Classifier::init(m_vectorizer.dim(), - 16, num_buffers, buffer_size_per_thread, threads, - m_vectorizer.is_precomputing()); - - m_current_timepoint = Classifier::get_start_timepoint(); - } - void precompute_inputs(const typename Vectorizer::dataset_t& buffer) { m_vectorizer.precompute(buffer); @@ -156,14 +183,10 @@ class TaggingImpl: public EntityTaggingClassifier virtual void register_handler(const tagging_callback_t fn) { + // std::cerr << "TaggingImpl::register_handler" << std::endl; m_callback = fn; } - virtual ~TaggingImpl() - { - // std::cerr << "~TaggingImpl" << std::endl; - } - protected: inline void increment_timepoint(uint64_t& timepoint) @@ -239,7 +262,7 @@ class TaggingImpl: public EntityTaggingClassifier while (lock_count > 1) { // Worker still uses this slot. Waiting... - // std::cerr << "TaggingImpl::send_all_results: waiting for slot " << slot_idx+1 + // std::cerr << "TaggingImpl::send_all_results: Worker still uses this slot. Waiting... " << slot_idx+1 // << " (lock_count==" << int(lock_count) << ")\n"; // Classifier::pretty_print(); Classifier::wait_for_slot(slot_idx); @@ -283,7 +306,7 @@ class TaggingImpl: public EntityTaggingClassifier while (lock_count > 1) { // Worker still uses this slot. Waiting... 
- // std::cerr << "tagging handle_timepoint, waiting for slot " << slot_no + // std::cerr << "TaggingImpl::acquire_slot tagging handle_timepoint, waiting for slot " << slot_no // << " lock_count=" << int(lock_count) << std::endl; // Classifier::pretty_print(); Classifier::wait_for_slot(slot_no); @@ -296,12 +319,14 @@ class TaggingImpl: public EntityTaggingClassifier } Classifier::increment_lock_count(slot_no); + m_current_slot_timepoints = Classifier::get_slot_size(); } public: virtual void handle_token_buffer(size_t slot_no, const typename Vectorizer::dataset_t& buffer, int timepoints_to_analyze = -1) { - // std::cerr << "TaggingImpl::handle_token_buffer " << slot_no << ", " << timepoints_to_analyze << std::endl; + // std::cerr << "TaggingImpl::handle_token_buffer " << slot_no << ", " + // << timepoints_to_analyze << std::endl; send_results_if_available(); acquire_slot(slot_no); size_t offset = slot_no * buffer.size() + Classifier::get_start_timepoint(); @@ -337,10 +362,10 @@ class TaggingImpl: public EntityTaggingClassifier uint64_t m_current_timepoint; uint32_t m_current_slot_timepoints; - int32_t m_current_slot_no; + // int32_t m_current_slot_no; int32_t m_last_completed_slot; - size_t m_curr_buff_idx; + // size_t m_curr_buff_idx; }; } // namespace impl diff --git a/deeplima/include/deeplima/token_sequence_analyzer.h b/deeplima/include/deeplima/token_sequence_analyzer.h index e3a52e6da..02cbdcf48 100644 --- a/deeplima/include/deeplima/token_sequence_analyzer.h +++ b/deeplima/include/deeplima/token_sequence_analyzer.h @@ -27,6 +27,7 @@ #include "deeplima/lemmatization/impl/lemmatization_impl.h" #include "deeplima/segmentation/impl/segmentation_decoder.h" #include "deeplima/tagging/impl/tagging_impl.h" +#include "deeplima/token_type.h" template<> struct std::hash { @@ -82,7 +83,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer : m_stridx(stridx), m_buffer(buffer), m_lemm_buffer(lemm_buffer), m_classes(classes), m_current(0), m_offset(offset), 
m_end(end - offset) { - assert(end > offset + 1); + assert(end >= offset + 1); } inline bool end() const @@ -90,7 +91,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer return m_current >= m_end; } - inline impl::token_t::token_flags_t flags() const + inline token_flags_t flags() const { assert(! end()); return m_buffer[m_current].m_flags; @@ -143,7 +144,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer inline void reset(size_t position = 0) { - std::cerr << "TokenSequenceAnalyzer::reset" << std::endl; + // std::cerr << "TokenSequenceAnalyzer::reset" << std::endl; m_current = position; } @@ -184,7 +185,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer m_ptoken(nullptr) { } - inline token_buffer_t<>::token_t::token_flags_t flags() const + inline token_flags_t flags() const { assert(nullptr != m_ptoken); return m_ptoken->m_flags; @@ -193,7 +194,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer inline bool eos() const { assert(nullptr != m_ptoken); - return flags() & token_buffer_t<>::token_t::token_flags_t::sentence_brk; + return flags() & token_flags_t::sentence_brk; } inline const std::string& form() const @@ -276,13 +277,14 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer m_current_timepoint(0), m_stridx_ptr(std::make_shared()), m_stridx(*m_stridx_ptr), + m_cls(), m_classes(std::make_shared>()) - { - std::cerr << "TokenSequenceAnalyzer::TokenSequenceAnalyzer " << model_fn << ", " - << lemm_model_fn << ", " << lemm_dict_fn << ", " - << fixed_ini_fn << ", " << lower_ini_fn << ", " - << fixed_lemm_fn - << std::endl; +{ + // std::cerr << "TokenSequenceAnalyzer::TokenSequenceAnalyzer " << model_fn << ", " + // << lemm_model_fn << ", " << lemm_dict_fn << ", " + // << fixed_ini_fn << ", " << lower_ini_fn << ", " + // << fixed_lemm_fn + // << std::endl; assert(m_buffer_size > 0); assert(num_buffers > 0); m_buffers.resize(num_buffers); @@ -324,7 +326,7 @@ class TokenSequenceAnalyzer : public 
ITokenSequenceAnalyzer m_cls.register_handler([this]( std::shared_ptr< StdMatrix > classes, size_t begin, size_t end, size_t slot_idx){ - std::cerr << "handler called: " << slot_idx << std::endl; + // std::cerr << "handler called: " << slot_idx << std::endl; lemmatize(m_buffers[slot_idx], m_lemm_buffers[slot_idx], classes, begin, end); @@ -344,7 +346,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer std::shared_ptr< StdMatrix > classes, size_t begin, size_t end, size_t slot_idx) { - std::cerr << "handler called: " << slot_idx << std::endl; + // std::cerr << "handler called: " << slot_idx << std::endl; m_classes = classes; m_output_callback(m_stridx_ptr, m_buffers[slot_idx], @@ -388,6 +390,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer virtual void register_handler(const output_callback_t fn) override { + // std::cerr << "TokenSequenceAnalyzer::register_handler" << std::endl; m_output_callback = fn; } @@ -415,6 +418,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer */ virtual void finalize() override { + // std::cerr << "TokenSequenceAnalyzer::finalize" << std::endl; if (m_current_timepoint > 0) { if (m_current_timepoint < m_buffer_size) @@ -430,6 +434,8 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer m_cls.send_all_results(); m_current_timepoint = 0; m_current_buffer = 0; + + m_cls.reset(); } virtual void operator()(const std::vector& tokens, uint32_t len) override @@ -449,7 +455,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer token.m_offset = src.m_offset; token.m_len = src.m_len; token.m_form_idx = m_stridx.get_idx(src.m_pch, src.m_len); - token.m_flags = impl::token_t::token_flags_t(src.m_flags); + token.m_flags = token_flags_t(src.m_flags); m_current_timepoint++; if (m_current_timepoint >= m_buffer_size) @@ -468,10 +474,10 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer void acquire_buffer() { - std::cerr << "acquire_buffer" << std::endl; + // std::cerr << 
"acquire_buffer" << std::endl; size_t next_buffer_idx = (m_current_buffer + 1 < m_buffers.size()) ? (m_current_buffer + 1) : 0; const token_buffer_t<>& next_buffer = m_buffers[next_buffer_idx]; - +// // wait for buffer while (next_buffer.locked()) { @@ -485,7 +491,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer void start_analysis(size_t buffer_idx, int count = -1) { - std::cerr << "TokenSequenceAnalyzer::start_analysis " << buffer_idx << ", " << count << std::endl; + // std::cerr << "TokenSequenceAnalyzer::start_analysis " << buffer_idx << ", " << count << std::endl; assert(!m_buffers[buffer_idx].locked()); m_buffers[buffer_idx].lock(); @@ -583,7 +589,7 @@ class TokenSequenceAnalyzer : public ITokenSequenceAnalyzer std::map> feats; morph_model::morph_feats_t encoded_feats = mm.convert(line, feats); - std::cerr << "load_pos_cache add " << line << " " << encoded_feats.toBaseType() << std::endl; + // std::cerr << "load_pos_cache add " << line << " " << encoded_feats.toBaseType() << std::endl; result.insert(encoded_feats); } return result; diff --git a/deeplima/include/deeplima/token_type.h b/deeplima/include/deeplima/token_type.h index bea5e41a1..5e5211f99 100644 --- a/deeplima/include/deeplima/token_type.h +++ b/deeplima/include/deeplima/token_type.h @@ -15,17 +15,18 @@ namespace deeplima { +enum token_flags_t : uint8_t +{ + none = 0x00, + sentence_brk = 0x01, + paragraph_brk = 0x02, + max_flags +}; + namespace impl { struct token_t { - enum token_flags_t : uint8_t - { - none = 0x00, - sentence_brk = 0x01, - paragraph_brk = 0x02, - max_flags - }; inline bool eos() const { diff --git a/deeplima/include/deeplima/utils/locked_buffer.h b/deeplima/include/deeplima/utils/locked_buffer.h index 09882a2f4..35e421b9e 100644 --- a/deeplima/include/deeplima/utils/locked_buffer.h +++ b/deeplima/include/deeplima/utils/locked_buffer.h @@ -26,22 +26,22 @@ struct locked_buffer_t m_lock_count(0), m_char_aligned_data(nullptr) { - std::cerr << 
"locked_buffer_t::locked_buffer_t()" - << (void*)this << std::endl; + // std::cerr << "locked_buffer_t::locked_buffer_t()" + // << (void*)this << std::endl; } ~locked_buffer_t() { - std::cerr << "locked_buffer_t::~locked_buffer_t() " - << (void*)this << std::endl; + // std::cerr << "locked_buffer_t::~locked_buffer_t() " + // << (void*)this << std::endl; m_data = nullptr; m_char_aligned_data = nullptr; } locked_buffer_t(const locked_buffer_t& other) { - std::cerr << "locked_buffer_t::locked_buffer_t(other)" - << (void*)this << std::endl; + // std::cerr << "locked_buffer_t::locked_buffer_t(other)" + // << (void*)this << std::endl; m_data = other.m_data; m_char_aligned_data = other.m_char_aligned_data; m_len = other.m_len; @@ -66,19 +66,19 @@ struct locked_buffer_t inline void lock() { - std::cerr << "locked_buffer_t::lock " << (void*)this << " " << m_lock_count; + // std::cerr << "locked_buffer_t::lock " << (void*)this << " " << m_lock_count; m_lock_count++; - std::cerr << " -> " << m_lock_count << std::endl; + // std::cerr << " -> " << m_lock_count << std::endl; } inline void unlock() { - std::cerr << "locked_buffer_t::unlock " << (void*)this << " " << m_lock_count; + // std::cerr << "locked_buffer_t::unlock " << (void*)this << " " << m_lock_count; m_len = 0; m_char_aligned_data = nullptr; assert(m_lock_count > 0); m_lock_count--; - std::cerr << " -> " << m_lock_count << std::endl; + // std::cerr << " -> " << m_lock_count << std::endl; } inline void set_read_start(const char* new_start) @@ -118,7 +118,7 @@ struct locked_buffer_set_t void init(size_t n, uint32_t buffer_size) { - std::cerr << "locked_buffer_set_t::init" << std::endl; + // std::cerr << "locked_buffer_set_t::init" << std::endl; assert(n > 0); assert(buffer_size > 0); @@ -178,7 +178,7 @@ struct locked_buffer_set_t void pretty_print() { - std::cerr << "BUFFS: "; + std::cerr << (void*)this << " BUFFS: "; for (size_t i = 0; i < m_data.size(); i++) { std::cerr << " | " << m_data[i].m_lock_count; diff --git 
a/deeplima/include/deeplima/utils/thread_pool.h b/deeplima/include/deeplima/utils/thread_pool.h index 1c47dce5f..816848bd7 100644 --- a/deeplima/include/deeplima/utils/thread_pool.h +++ b/deeplima/include/deeplima/utils/thread_pool.h @@ -59,6 +59,7 @@ class ThreadPool void stop() { m_stop = true; + // push one null job per worker so that every worker wakes up and can terminate for (size_t i = 0 ; i < m_workers.size(); i++) { push(nullptr); @@ -80,7 +81,7 @@ class ThreadPool } else { - throw std::runtime_error("All workers must be joinable here."); + throw std::runtime_error("All workers must be unjoinable (inactive threads) here."); } } } @@ -99,6 +100,9 @@ class ThreadPool protected: + /** This will wait until a job is available and then the @ref job parameter will + * be set to the available job, which will be removed from the list. + */ inline bool wait_for_new_job(void** job) { std::unique_lock l(m_mutex); @@ -124,6 +128,8 @@ class ThreadPool void thread_fn(size_t worker_id) { void* job = nullptr; + // loop to dispatch pushed jobs to the threads of this pool + // wait_for_new_job is blocking until a job becomes available while (true) { // std::cerr << "thread_fn " << worker_id << " main loop" << std::endl; @@ -132,11 +138,13 @@ class ThreadPool // std::cerr << "wait_for_new_job is true" << std::endl; if (nullptr == job) { + // we should get a null job only when stopping + // std::cerr << "wait_for_new_job: we should get a null job only when stopping" << std::endl; break; } - // std::cerr << "worker: " << (void*) job << " started" << std::endl; + // std::cerr << "worker: running job " << (void*) job << std::endl; P::run_one_job(static_cast(this), worker_id, job); - // std::cerr << "worker: " << (void*) job << " completed" << std::endl; + // std::cerr << "worker: completed job " << (void*) job << std::endl; m_cv_notify.notify_all(); // std::cerr << "notify_all done" << std::endl; } diff --git a/deeplima/libs/tasks/segmentation/inference/segmentation_impl.cpp
b/deeplima/libs/tasks/segmentation/inference/segmentation_impl.cpp index 0329505c3..7348dd789 100644 --- a/deeplima/libs/tasks/segmentation/inference/segmentation_impl.cpp +++ b/deeplima/libs/tasks/segmentation/inference/segmentation_impl.cpp @@ -7,8 +7,9 @@ namespace deeplima::segmentation::impl { -SegmentationImpl::SegmentationImpl() - : m_decoder(SegmentationClassifier::get_output(), m_char_len), +SegmentationImpl::SegmentationImpl() : + SegmentationClassifier(), + m_decoder(SegmentationClassifier::get_output(), m_char_len), m_current_slot_timepoints(0), m_current_slot_no(-1), m_last_completed_slot(-1), @@ -16,20 +17,20 @@ SegmentationImpl::SegmentationImpl() m_curr_buff_idx(0) {} -SegmentationImpl::SegmentationImpl( - const std::vector& ngram_descr, - size_t threads, - size_t buffer_size_per_thread - ) - : SegmentationClassifier( - ngram_descr.size() * 2, 4, threads * 2, buffer_size_per_thread, threads), - m_input_encoder(ngram_descr), - m_decoder(SegmentationClassifier::get_output(), m_char_len), - m_current_timepoint(SegmentationClassifier::get_start_timepoint()), - m_buff_set(SegmentationClassifier::get_num_threads() * 2, SegmentationClassifier::get_slot_size() * 4) -{ - m_char_len.resize(SegmentationClassifier::size()); -} +// SegmentationImpl::SegmentationImpl( +// const std::vector& ngram_descr, +// size_t threads, +// size_t buffer_size_per_thread +// ) +// : SegmentationClassifier( +// ngram_descr.size() * 2, 4, threads * 2, buffer_size_per_thread, threads), +// m_input_encoder(ngram_descr), +// m_decoder(SegmentationClassifier::get_output(), m_char_len), +// m_current_timepoint(SegmentationClassifier::get_start_timepoint()), +// m_buff_set(SegmentationClassifier::get_num_threads() * 2, SegmentationClassifier::get_slot_size() * 4) +// { +// m_char_len.resize(SegmentationClassifier::size()); +// } void SegmentationImpl::load(const std::string& fn) { @@ -59,7 +60,7 @@ void SegmentationImpl::init(size_t threads, size_t buffer_size_per_thread) void 
SegmentationImpl::parse_from_stream(const read_callback_t fn) { - std::cerr << "SegmentationImpl::parse_from_stream" << std::endl; + // std::cerr << "SegmentationImpl::parse_from_stream" << std::endl; size_t n = 0; bool just_started = true; bool continue_reading = true; @@ -85,13 +86,13 @@ void SegmentationImpl::parse_from_stream(const read_callback_t fn) break; } counter += bytes_read; - std::cerr << "SegmentationImpl::parse_from_stream Reading callback: " - << bytes_read << " bytes, continue_reading=" - << continue_reading << " counter=" << counter << std::endl; + // std::cerr << "SegmentationImpl::parse_from_stream Reading callback: " + // << bytes_read << " bytes, continue_reading=" + // << continue_reading << " counter=" << counter << std::endl; buff.m_char_aligned_data = (const char*)(buff.m_data); buff.m_len = bytes_read; - std::cerr << "SegmentationImpl::parse_from_stream locking (m_buff_set) buff " - << n << std::endl; + // std::cerr << "SegmentationImpl::parse_from_stream locking (m_buff_set) buff " + // << n << std::endl; buff.lock(); int32_t pos = 0; @@ -158,8 +159,8 @@ void SegmentationImpl::parse_from_stream(const read_callback_t fn) send_next_results(); } - m_buff_set.pretty_print(); - SegmentationClassifier::pretty_print(); + // m_buff_set.pretty_print(); + // SegmentationClassifier::pretty_print(); n = m_buff_set.next(n); } @@ -276,15 +277,15 @@ void SegmentationImpl::acquire_slot() if (0 == m_current_slot_timepoints || m_current_slot_no < 0) { m_current_slot_no = SegmentationClassifier::get_slot_idx(m_current_timepoint); - // std::cerr << "SegmentationImpl::acquire_slot: got " << m_current_slot_no << " for timepoint " - // << m_current_timepoint << std::endl; + // std::cerr << "SegmentationImpl::acquire_slot: got " << m_current_slot_no + // << " for timepoint " << m_current_timepoint << std::endl; uint8_t lock_count = SegmentationClassifier::get_lock_count(m_current_slot_no); while (lock_count > 1) { // Worker still uses this slot. Waiting... 
- // std::cerr << "handle_timepoint, waiting for slot " << m_current_slot_no - // << " lock_count=" << lock_count << std::endl; + // std::cerr << "SegmentationImpl::acquire_slot, waiting for slot " + // << m_current_slot_no << " / " << lock_count << std::endl; SegmentationClassifier::wait_for_slot(m_current_slot_no); lock_count = SegmentationClassifier::get_lock_count(m_current_slot_no); } @@ -327,8 +328,11 @@ void SegmentationImpl::no_more_data() void SegmentationImpl::finalize() { - std::cerr << "SegmentationImpl::finalize" << std::endl; + // std::cerr << "SegmentationImpl::finalize" << std::endl; + SegmentationClassifier::reset(); + m_char_len.resize(SegmentationClassifier::size()); + m_current_timepoint = SegmentationClassifier::get_start_timepoint(); // no_more_data(); // // for (size_t i = 0; i < m_buff_set.size(); i++) diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnNER/RnnNER.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnNER/RnnNER.cpp index f5f9a07ad..215dc9a3c 100644 --- a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnNER/RnnNER.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnNER/RnnNER.cpp @@ -205,7 +205,8 @@ Lima::LimaStatusCode RnnNER::process(Lima::AnalysisContent &analysis) const token.m_offset = src->position(); token.m_len = src->length(); token.m_pch = v[k].c_str(); - token.m_flags = segmentation::token_pos::flag_t(src->status().getStatus() & StatusType::T_SENTENCE_BRK); + token.m_flags = token_flags_t(src->status().getStatus() + & StatusType::T_SENTENCE_BRK); } } m_d->tagger(buffer); diff --git a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp index fdbad84b1..d30ca4287 100644 --- 
a/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp +++ b/lima_linguisticprocessing/src/linguisticProcessing/core/DeepLimaUnits/RnnTokenizer/RnnTokenizer.cpp @@ -24,6 +24,7 @@ #include "RnnTokenizer.h" #include "deeplima/segmentation.h" +#include "deeplima/token_type.h" @@ -60,6 +61,7 @@ CONFIGURATIONHELPER_LOGGING_INIT(TOKENIZERLOGINIT); class RnnTokenizerPrivate : public DeepTokenizerBase, public ConfigurationHelper { + friend RnnTokenizer; public: RnnTokenizerPrivate(); virtual ~RnnTokenizerPrivate(); @@ -81,16 +83,16 @@ class RnnTokenizerPrivate : public DeepTokenizerBase, public ConfigurationHelper void init(GroupConfigurationStructure& unitConfiguration); void tokenize(const QString& text, std::vector>& sentences); - MediaId m_language; - FsaStringsPool* m_stringsPool; - LinguisticGraphVertex m_currentVx; - QString m_data; - protected: void append_new_word(std::vector< TPrimitiveToken >& current_sentence, const QString& current_token, int current_token_offset) const; + MediaId m_language; + FsaStringsPool* m_stringsPool; + LinguisticGraphVertex m_currentVx; + QString m_data; + size_t m_max_seq_len; std::map> m_trrules; @@ -109,6 +111,7 @@ RnnTokenizerPrivate::RnnTokenizerPrivate() : m_stringsPool(nullptr), m_currentVx(0), m_ignoreEOL(false), + m_segm(), m_loaded(false) { } @@ -337,7 +340,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vectorposition(); token.m_len = src->length(); token.m_pch = v[k].c_str(); - token.m_flags = segmentation::token_pos::flag_t(src->status().getStatus() & StatusType::T_SENTENCE_BRK); + token.m_flags = token_flags_t(src->status().getStatus() + & StatusType::T_SENTENCE_BRK); } } m_d->analyzer(buffer);