Commit

Reset deeplima tokenizer on each text in lima too
This is necessary until issue #172 is solved
kleag committed May 15, 2024
1 parent 55e6f8c commit 2b3dcff
Showing 1 changed file with 17 additions and 14 deletions.
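
The change below is a workaround: the tokenizer's SegmentationImpl is now held through a std::shared_ptr and recreated at the start of every tokenize() call, with the lazy-load guard in m_load_fn disabled so the model is reloaded for each text. The following is a minimal sketch of this per-text reset pattern, assuming a simplified stand-in SegmentationImpl that only mimics load() and init(); it illustrates the idea and is not the actual LIMA/deeplima code.

```cpp
// Sketch of the per-text reset pattern, assuming a stand-in for
// deeplima's segmentation::impl::SegmentationImpl.
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <string>

struct SegmentationImpl
{
  void load(const std::string& model_path) { std::cout << "loading " << model_path << "\n"; }
  void init(std::size_t threads, std::size_t bytes_per_thread)
  {
    std::cout << "init: " << threads << " thread(s), " << bytes_per_thread << " bytes each\n";
  }
  // the real class also provides register_handler() and parse_from_stream()
};

class RnnTokenizerPrivate
{
public:
  void init(const std::string& model_file_name)
  {
    // Before this commit, m_load_fn returned early once m_loaded was true,
    // so the model was loaded at most once per tokenizer instance.
    m_load_fn = [this, model_file_name]()
    {
      m_segm->load(model_file_name);
      m_segm->init(1, 16 * 1024);
    };
  }

  void tokenize(const std::string& text)
  {
    // Workaround for issue #172: throw away the previous engine and its
    // internal state, then reload the model for this text.
    m_segm = std::make_shared<SegmentationImpl>();
    m_load_fn();
    // ... register a token handler and stream `text` through the engine,
    //     as done in the diff below ...
    (void)text;
  }

private:
  std::shared_ptr<SegmentationImpl> m_segm;  // was a plain value member before this commit
  std::function<void()> m_load_fn;
};

int main()
{
  RnnTokenizerPrivate tokenizer;
  tokenizer.init("tokenizer.model");       // hypothetical model path
  tokenizer.tokenize("First document.");
  tokenizer.tokenize("Second document.");  // gets a fresh, freshly loaded engine
}
```

Recreating and reloading the engine for every input is more expensive than keeping a single lazily initialised instance (the model file is read again for each text), but it guarantees that no segmentation state leaks from one document to the next until issue #172 is fixed properly.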
@@ -100,7 +100,7 @@ class RnnTokenizerPrivate : public DeepTokenizerBase, public ConfigurationHelper
   // Parameters
   bool m_ignoreEOL;
 
-  segmentation::impl::SegmentationImpl m_segm;
+  std::shared_ptr<segmentation::impl::SegmentationImpl> m_segm;
 
   std::function<void()> m_load_fn;
   bool m_loaded;
@@ -111,7 +111,7 @@ RnnTokenizerPrivate::RnnTokenizerPrivate() :
   m_stringsPool(nullptr),
   m_currentVx(0),
   m_ignoreEOL(false),
-  m_segm(),
+  m_segm(nullptr),
   m_loaded(false)
 {
 }
@@ -148,6 +148,7 @@ LimaStatusCode RnnTokenizer::process(AnalysisContent& analysis) const
   LOG_MESSAGE_WITH_PROLOG(LINFO, "start tokenizer process");
   TimeUtilsController RnnTokenizerProcessTime("RnnTokenizer");
 
+
   auto anagraph = std::make_shared<AnalysisGraph>("AnalysisGraph", m_d->m_language, true, true);
   analysis.setData("AnalysisGraph", anagraph);
   auto graph = anagraph->getGraph();
@@ -268,21 +269,21 @@ void RnnTokenizerPrivate::init(GroupConfigurationStructure& unitConfiguration)
 
   m_load_fn = [this, model_file_name]()
   {
-    if (m_loaded)
-    {
-      return;
-    }
+    // if (m_loaded)
+    // {
+    //   return;
+    // }
 
-    m_segm.load(model_file_name.toStdString());
-    m_segm.init(1, 16*1024); // threads, buffer size per thread
+    m_segm->load(model_file_name.toStdString());
+    m_segm->init(1, 16*1024); // threads, buffer size per thread
 
     m_loaded = true;
   };
 
-  if (!isInitLazy())
-  {
-    m_load_fn();
-  }
+  // if (!isInitLazy())
+  // {
+  //   m_load_fn();
+  // }
 }
 
 void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& current_sentence,
@@ -315,6 +316,8 @@ void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& curren
 
 void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<TPrimitiveToken>>& sentences)
 {
+  m_segm = std::make_shared<segmentation::impl::SegmentationImpl>();
+
   m_load_fn();
 
   LOG_MESSAGE_WITH_PROLOG(LDEBUG, "RnnTokenizerPrivate::tokenize" << text.left(100));
@@ -327,7 +330,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
 
   auto text_utf8 = text.toStdString();
 
-  m_segm.register_handler([this, &sentences, &current_sentence, &current_token_offset]
+  m_segm->register_handler([this, &sentences, &current_sentence, &current_token_offset]
                           (const std::vector<segmentation::token_pos>& tokens,
                            uint32_t len)
   {
@@ -349,7 +352,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
   });
 
   size_t bytes_consumed = 0;
-  m_segm.parse_from_stream([&text_utf8, &bytes_consumed]
+  m_segm->parse_from_stream([&text_utf8, &bytes_consumed]
                            (uint8_t* buffer,
                             int32_t& read,
                             size_t max)