Commit

Reset deeplima tokenizer on each text in lima too
This is necessary until issue #172 is solved
kleag committed May 15, 2024
1 parent 55e6f8c commit 2b3dcff
Showing 1 changed file with 17 additions and 14 deletions.
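
The change below is a workaround: the tokenizer's SegmentationImpl is now held through a std::shared_ptr and recreated at the start of every tokenize() call, with the lazy-load guard in m_load_fn disabled so the model is reloaded for each text. The following is a minimal sketch of this per-text reset pattern, assuming a simplified stand-in SegmentationImpl that only mimics load() and init(); it illustrates the idea and is not the actual LIMA/deeplima code.

```cpp
// Sketch of the per-text reset pattern, assuming a stand-in for
// deeplima's segmentation::impl::SegmentationImpl.
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <string>

struct SegmentationImpl
{
  void load(const std::string& model_path) { std::cout << "loading " << model_path << "\n"; }
  void init(std::size_t threads, std::size_t bytes_per_thread)
  {
    std::cout << "init: " << threads << " thread(s), " << bytes_per_thread << " bytes each\n";
  }
  // the real class also provides register_handler() and parse_from_stream()
};

class RnnTokenizerPrivate
{
public:
  void init(const std::string& model_file_name)
  {
    // Before this commit, m_load_fn returned early once m_loaded was true,
    // so the model was loaded at most once per tokenizer instance.
    m_load_fn = [this, model_file_name]()
    {
      m_segm->load(model_file_name);
      m_segm->init(1, 16 * 1024);
    };
  }

  void tokenize(const std::string& text)
  {
    // Workaround for issue #172: throw away the previous engine and its
    // internal state, then reload the model for this text.
    m_segm = std::make_shared<SegmentationImpl>();
    m_load_fn();
    // ... register a token handler and stream `text` through the engine,
    //     as done in the diff below ...
    (void)text;
  }

private:
  std::shared_ptr<SegmentationImpl> m_segm;  // was a plain value member before this commit
  std::function<void()> m_load_fn;
};

int main()
{
  RnnTokenizerPrivate tokenizer;
  tokenizer.init("tokenizer.model");       // hypothetical model path
  tokenizer.tokenize("First document.");
  tokenizer.tokenize("Second document.");  // gets a fresh, freshly loaded engine
}
```

Recreating and reloading the engine for every input is more expensive than keeping a single lazily initialised instance (the model file is read again for each text), but it guarantees that no segmentation state leaks from one document to the next until issue #172 is fixed properly.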
@@ -100,7 +100,7 @@ class RnnTokenizerPrivate : public DeepTokenizerBase, public ConfigurationHelper
   // Parameters
   bool m_ignoreEOL;
 
-  segmentation::impl::SegmentationImpl m_segm;
+  std::shared_ptr<segmentation::impl::SegmentationImpl> m_segm;
 
   std::function<void()> m_load_fn;
   bool m_loaded;
@@ -111,7 +111,7 @@ RnnTokenizerPrivate::RnnTokenizerPrivate() :
   m_stringsPool(nullptr),
   m_currentVx(0),
   m_ignoreEOL(false),
-  m_segm(),
+  m_segm(nullptr),
   m_loaded(false)
 {
 }
@@ -148,6 +148,7 @@ LimaStatusCode RnnTokenizer::process(AnalysisContent& analysis) const
   LOG_MESSAGE_WITH_PROLOG(LINFO, "start tokenizer process");
   TimeUtilsController RnnTokenizerProcessTime("RnnTokenizer");
 
+
   auto anagraph = std::make_shared<AnalysisGraph>("AnalysisGraph", m_d->m_language, true, true);
   analysis.setData("AnalysisGraph", anagraph);
   auto graph = anagraph->getGraph();
@@ -268,21 +269,21 @@ void RnnTokenizerPrivate::init(GroupConfigurationStructure& unitConfiguration)
 
   m_load_fn = [this, model_file_name]()
   {
-    if (m_loaded)
-    {
-      return;
-    }
+    // if (m_loaded)
+    // {
+    //   return;
+    // }
 
-    m_segm.load(model_file_name.toStdString());
-    m_segm.init(1, 16*1024); // threads, buffer size per thread
+    m_segm->load(model_file_name.toStdString());
+    m_segm->init(1, 16*1024); // threads, buffer size per thread
 
     m_loaded = true;
   };
 
-  if (!isInitLazy())
-  {
-    m_load_fn();
-  }
+  // if (!isInitLazy())
+  // {
+  //   m_load_fn();
+  // }
 }
 
 void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& current_sentence,
@@ -315,6 +316,8 @@ void RnnTokenizerPrivate::append_new_word(std::vector< TPrimitiveToken >& curren
 
 void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<TPrimitiveToken>>& sentences)
 {
+  m_segm = std::make_shared<segmentation::impl::SegmentationImpl>();
+
   m_load_fn();
 
   LOG_MESSAGE_WITH_PROLOG(LDEBUG, "RnnTokenizerPrivate::tokenize" << text.left(100));
@@ -327,7 +330,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
 
   auto text_utf8 = text.toStdString();
 
-  m_segm.register_handler([this, &sentences, &current_sentence, &current_token_offset]
+  m_segm->register_handler([this, &sentences, &current_sentence, &current_token_offset]
                           (const std::vector<segmentation::token_pos>& tokens,
                            uint32_t len)
   {
@@ -349,7 +352,7 @@ void RnnTokenizerPrivate::tokenize(const QString& text, std::vector<std::vector<
   });
 
   size_t bytes_consumed = 0;
-  m_segm.parse_from_stream([&text_utf8, &bytes_consumed]
+  m_segm->parse_from_stream([&text_utf8, &bytes_consumed]
                            (uint8_t* buffer,
                             int32_t& read,
                             size_t max)