Skip to content

Commit

Permalink
Tipline search by explainers (#1971)
Browse files Browse the repository at this point in the history
## Description

Explainers can match incoming user queries to the tipline and be returned as search results, like fact-checks. This feature has two main parts: indexing and retrieving. Steps below:

**Refactoring**

- [x] Implement a `TiplineSearchResult` class that abstracts the logic shared by fact-check reports and explainers

**Indexing**

- [x] When an explainer is saved, index in Alegre each paragraph as a separate document
- [x] Before doing so, make sure that paragraphs that don't exist anymore are deleted from the index
- [x] Since on the UI explainers are updated on blur, try to avoid race conditions by making sure that an indexing job is superseded by a more recent one
- [x] Implement automated tests for this

**Retrieving**

- [x] In tipline queries, search for explainers if no published fact-checks are found
- [x] In tipline queries, return explainers if matched media has no published fact-check
- [x] Once explainers are returned by Alegre, get the items associated with them in order for the tipline request to be associated with the right media cluster
- [x] There is no concept of a published explainer or an explainer report for now, so just format the search result with the explainer's title, summary and link
- [x] Search for explainers by keyword
- [x] Search for explainers by similarity (by calling Alegre)
- [x] Implement automated tests for this

Reference: CV2-4664.

## How has this been tested?

Automated tests implemented for new features. Things to test manually:

- [x] Search by fact-checks that return both text report and visual card report
- [x] Search that matches media clusters and return fact-checks
- [x] Search that matches media clusters and return explainers
- [x] Search that matches explainers and return explainers
  • Loading branch information
caiosba authored Jul 30, 2024
1 parent c32ddd1 commit 1192dad
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 63 deletions.
58 changes: 58 additions & 0 deletions app/lib/tipline_search_result.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# A single result that can be sent back to a tipline user, regardless of
# whether it originated from a fact-check report or from an explainer.
#
# Attributes:
#   team      - team that owns the result; its settings drive language rules,
#               URL shortening and the footer.
#   title     - headline, rendered in bold (`*title*`) by #text.
#   body      - main text of the result.
#   image_url - URL of the image to send when format is :image.
#   language  - language code of the result.
#   url       - link appended to the text message.
#   type      - :explainer or :fact_check.
#   format    - :text or :image.
class TiplineSearchResult
  # Label put in front of each team report setting in the footer.
  # `signature` has no entry on purpose: it is rendered without a prefix.
  FOOTER_PREFIXES = {
    whatsapp: 'WhatsApp: ',
    facebook: 'FB Messenger: m.me/',
    twitter: 'Twitter: twitter.com/',
    telegram: 'Telegram: t.me/',
    viber: 'Viber: ',
    line: 'LINE: ',
    instagram: 'Instagram: instagram.com/'
  }.freeze

  # Footer lines are emitted in this order.
  FOOTER_FIELDS = [:signature, :whatsapp, :facebook, :twitter, :telegram, :viber, :line, :instagram].freeze

  attr_accessor :team, :title, :body, :image_url, :language, :url, :type, :format

  def initialize(team:, title:, body:, image_url:, language:, url:, type:, format:)
    self.team = team
    self.title = title
    self.body = body
    self.image_url = image_url
    self.language = language
    self.url = url
    self.type = type # :explainer or :fact_check
    self.format = format # :text or :image
  end

  # Whether this result may be sent to a user whose language is `language`.
  #
  # Returns true when the team has fewer than two languages, when the result
  # is already in the requested language, or when the team's Alegre bot
  # installation does not enable `single_language_fact_checks_enabled`.
  def should_send_in_language?(language)
    return true if team.get_languages.to_a.size < 2
    # A language match is always allowed, so the installation lookup is not needed.
    return true if self.language == language

    installation = TeamBotInstallation.where(team_id: team.id, user: BotUser.alegre_user).last
    !installation&.alegre_settings&.dig('single_language_fact_checks_enabled')
  end

  # Reads `key` from the team report settings for `language`.
  # Returns nil when the team has no report settings for that language or key.
  def team_report_setting_value(key, language)
    team.get_report.to_h.with_indifferent_access.dig(language, key)
  end

  # Builds the footer for `language`: one line per non-blank team report
  # setting, joined by newlines. Returns an empty string when nothing is set.
  def footer(language)
    lines = FOOTER_FIELDS.map do |field|
      value = team_report_setting_value(field.to_s, language)
      "#{FOOTER_PREFIXES[field]}#{value}" unless value.blank?
    end
    lines.compact.join("\n")
  end

  # Renders the result as a text message.
  #
  # language  - when given, the team footer for that language is appended,
  #             provided it is not blank and `use_signature` is set.
  # hide_body - when true, the body is left out.
  #
  # Title, body and URL are passed through UrlRewriter when the team shortens
  # outgoing URLs; the footer is appended afterwards and is not rewritten.
  # Parts are separated by a blank line.
  def text(language = nil, hide_body = false)
    parts = []
    parts << "*#{title.strip}*" unless title.blank?
    parts << body.to_s unless hide_body
    parts << url unless url.blank?

    # The team settings do not change between parts, so read them only once.
    if team.get_shorten_outgoing_urls
      utm_code = team.get_outgoing_urls_utm_code
      parts = parts.map { |part| UrlRewriter.shorten_and_utmize_urls(part, utm_code) }
    end

    unless language.nil?
      signature = footer(language)
      parts << signature if !signature.blank? && team_report_setting_value('use_signature', language)
    end
    parts.join("\n\n")
  end
end
73 changes: 52 additions & 21 deletions app/models/concerns/smooch_search.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,34 @@ module SmoochSearch
extend ActiveSupport::Concern

module ClassMethods

# This method runs in background
def search(app_id, uid, language, message, team_id, workflow, provider = nil)
platform = self.get_platform_from_message(message)
begin
sm = CheckStateMachine.new(uid)
self.get_installation(self.installation_setting_id_keys, app_id) if self.config.blank?
RequestStore.store[:smooch_bot_provider] = provider unless provider.blank?
results = self.get_search_results(uid, message, team_id, language).select do |pm|
pm = Relationship.confirmed_parent(pm)
report = pm.get_dynamic_annotation('report_design')
!report.nil? && !!report.should_send_report_in_this_language?(language)
end.collect{ |pm| Relationship.confirmed_parent(pm) }.uniq
if results.empty?
query = self.get_search_query(uid, message)
results = self.get_search_results(uid, query, team_id, language).collect{ |pm| Relationship.confirmed_parent(pm) }.uniq
reports = results.collect{ |pm| pm.get_dynamic_annotation('report_design') }.reject{ |r| r.nil? }.collect{ |r| r.report_design_to_tipline_search_result }.select{ |r| r.should_send_in_language?(language) }

# Extract explainers from matched media if they don't have published fact-checks but they have explainers
reports = results.collect{ |pm| pm.explainers.to_a }.flatten.uniq.first(3).map(&:as_tipline_search_result) if !results.empty? && reports.empty?

# Search for explainers if fact-checks were not found
if reports.empty? && query['type'] == 'text'
explainers = self.search_for_explainers(uid, query['text'], team_id, language).first(3).select{ |explainer| explainer.as_tipline_search_result.should_send_in_language?(language) }
Rails.logger.info "[Smooch Bot] Text similarity search got #{explainers.count} explainers while looking for '#{query['text']}' for team #{team_id}"
results = explainers.collect{ |explainer| explainer.project_medias.to_a }.flatten.uniq.reject{ |pm| pm.blank? }.first(3)
reports = explainers.map(&:as_tipline_search_result)
end

if reports.empty?
self.bundle_messages(uid, '', app_id, 'default_requests', nil, true)
self.send_final_message_to_user(uid, self.get_custom_string('search_no_results', language), workflow, language, 'no_results')
else
self.send_search_results_to_user(uid, results, team_id, platform)
self.send_search_results_to_user(uid, reports, team_id, platform)
sm.go_to_search_result
self.save_search_results_for_user(uid, results.map(&:id))
self.delay_for(1.second, { queue: 'smooch_priority' }).ask_for_feedback_when_all_search_results_are_received(app_id, language, workflow, uid, platform, provider, 1)
Expand Down Expand Up @@ -80,7 +91,7 @@ def filter_search_results(pms, after, feed_id, team_ids)
end

def is_a_valid_search_result(pm)
pm.report_status == 'published' && [CheckArchivedFlags::FlagCodes::NONE, CheckArchivedFlags::FlagCodes::UNCONFIRMED].include?(pm.archived)
(pm.report_status == 'published' || pm.explainers.count > 0) && [CheckArchivedFlags::FlagCodes::NONE, CheckArchivedFlags::FlagCodes::UNCONFIRMED].include?(pm.archived)
end

def reject_temporary_results(results)
Expand All @@ -91,7 +102,7 @@ def reject_temporary_results(results)

def parse_search_results_from_alegre(results, after = nil, feed_id = nil, team_ids = nil)
pms = reject_temporary_results(results).sort_by{ |a| [a[1][:model] != Bot::Alegre::ELASTICSEARCH_MODEL ? 1 : 0, a[1][:score]] }.to_h.keys.reverse.collect{ |id| Relationship.confirmed_parent(ProjectMedia.find_by_id(id)) }
filter_search_results(pms, after, feed_id, team_ids).uniq(&:id).first(3)
filter_search_results(pms, after, feed_id, team_ids).uniq(&:id).sort_by{ |pm| pm.report_status == 'published' ? 0 : 1 }.first(3)
end

def date_filter(team_id)
Expand All @@ -111,11 +122,14 @@ def get_text_similarity_threshold
value == 0.0 ? 0.85 : value
end

def get_search_results(uid, last_message, team_id, language)
# Builds the search query for a tipline user by bundling the messages already
# collected for `uid` together with `last_message`.
#
# Returns whatever `bundle_list_of_messages` produces; callers in this module
# read its 'type' and 'text' keys.
def get_search_query(uid, last_message)
  self.bundle_list_of_messages(self.list_of_bundled_messages_from_user(uid), last_message, true)
end

def get_search_results(uid, message, team_id, language)
results = []
begin
list = self.list_of_bundled_messages_from_user(uid)
message = self.bundle_list_of_messages(list, last_message, true)
type = message['type']
after = self.date_filter(team_id)
query = message['text']
Expand Down Expand Up @@ -243,22 +257,22 @@ def search_by_keywords_for_similar_published_fact_checks(words, after, team_ids,
results
end

def send_search_results_to_user(uid, results, team_id, platform)
def send_search_results_to_user(uid, reports, team_id, platform)
team = Team.find(team_id)
redis = Redis.new(REDIS_CONFIG)
language = self.get_user_language(uid)
reports = results.collect{ |r| r.get_dynamic_annotation('report_design') }
# Get reports languages
reports_language = reports.map { |r| r&.report_design_field_value('language') }.uniq
if team.get_languages.to_a.size > 1 && !reports_language.include?(language)
reports_languages = reports.map(&:language).uniq

if team.get_languages.to_a.size > 1 && !reports_languages.include?(language)
self.send_message_to_user(uid, self.get_string(:no_results_in_language, language).gsub('%{language}', CheckCldr.language_code_to_name(language, language)), {}, false, true, 'no_results')
sleep 1
end
reports.reject{ |r| r.blank? }.each do |report|

reports.each do |report|
response = nil
no_body = (platform == 'Facebook Messenger' && !report.report_design_field_value('published_article_url').blank?)
response = self.send_message_to_user(uid, report.report_design_text(nil, no_body), {}, false, true, 'search_result') if report.report_design_field_value('use_text_message')
response = self.send_message_to_user(uid, '', { 'type' => 'image', 'mediaUrl' => report.report_design_image_url }, false, true, 'search_result') if !report.report_design_field_value('use_text_message') && report.report_design_field_value('use_visual_card')
no_body = (platform == 'Facebook Messenger' && !report.url.blank?)
response = self.send_message_to_user(uid, report.text(nil, no_body), {}, false, true, 'search_result') if report.format == :text
response = self.send_message_to_user(uid, '', { 'type' => 'image', 'mediaUrl' => report.image_url }, false, true, 'search_result') if report.format == :image
id = self.get_id_from_send_response(response)
redis.rpush("smooch:search:#{uid}", id) unless id.blank?
end
Expand All @@ -284,5 +298,22 @@ def ask_for_feedback_when_all_search_results_are_received(app_id, language, work
self.delay_for(1.second, { queue: 'smooch_priority' }).ask_for_feedback_when_all_search_results_are_received(app_id, language, workflow, uid, platform, provider, attempts + 1) if attempts < max # Try for 20 seconds
end
end

# Finds explainers of a team that match a tipline text query.
#
# A one-word claim is matched by keyword (case-insensitive substring of the
# explainer title or description, newest first, optionally restricted to
# `language`); anything longer is delegated to Explainer.search_by_similarity.
#
# uid      - tipline user identifier, only used when reporting an error.
# query    - raw text sent by the user.
# team_id  - team whose explainers are searched.
# language - language code of the user.
#
# Returns a relation of explainers joined with their project medias. If the
# search fails, the error is passed to handle_search_error and an empty
# relation is used, so callers always get something they can enumerate.
def search_for_explainers(uid, query, team_id, language)
  results = begin
    text = ::Bot::Smooch.extract_claim(query)
    if Bot::Alegre.get_number_of_words(text) == 1
      matches = Explainer.where(team_id: team_id).where('description ILIKE ? OR title ILIKE ?', "%#{text}%", "%#{text}%")
      matches = matches.where(language: language) if should_restrict_by_language?([team_id])
      matches.order('updated_at DESC')
    else
      Explainer.search_by_similarity(text, language, team_id)
    end
  rescue StandardError => e
    self.handle_search_error(uid, e, language)
    # Without a fallback the join below would be called on nil.
    Explainer.none
  end
  results.joins(:project_medias)
end
end
end
85 changes: 84 additions & 1 deletion app/models/explainer.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
class Explainer < ApplicationRecord
include Article

# FIXME: Read from workspace settings
ALEGRE_MODELS_AND_THRESHOLDS = {
# Bot::Alegre::ELASTICSEARCH_MODEL => 0.8 # Sometimes this is easier for local development
Bot::Alegre::PARAPHRASE_MULTILINGUAL_MODEL => 0.7
}

belongs_to :team

has_annotations
Expand All @@ -12,12 +18,89 @@ class Explainer < ApplicationRecord
validates_presence_of :team, :title, :description
validate :language_in_allowed_values, unless: proc { |e| e.language.blank? }

after_save :update_paragraphs_in_alegre

# Intentionally empty: explainers have nothing to notify bots about.
# NOTE(review): presumably overrides a hook expected by the included Article concern - confirm.
def notify_bots
  # Nothing to do for Explainer
end

# Intentionally empty so that explainers do not reuse the Alegre callbacks
# from article.rb; this class indexes its paragraphs through the
# `update_paragraphs_in_alegre` after_save callback instead.
def send_to_alegre
  # Nothing to do for Explainer
  # Let's not use the same callbacks from article.rb
end

# Wraps this explainer in a TiplineSearchResult so it can be sent to tipline
# users. Explainers are always text results without an image: the title,
# description (as body), language and URL are copied from the record.
def as_tipline_search_result
  attributes = {
    type: :explainer,
    format: :text,
    team: team,
    title: title,
    body: description,
    image_url: nil,
    language: language,
    url: url
  }
  TiplineSearchResult.new(**attributes)
end

# after_save hook: schedules the class-level `update_paragraphs_in_alegre` job
# for this explainer.
#
# The job receives the number of non-blank paragraphs the description had
# before this save, plus the current time as a float.
def update_paragraphs_in_alegre
  old_description = description_before_last_save.to_s
  old_paragraphs = old_description.gsub(/\r\n?/, "\n").split(/\n+/)
  previous_paragraphs_count = old_paragraphs.count { |paragraph| !paragraph.strip.blank? }

  # Run 5 seconds later - a way to be sure there won't be more updates coming
  job = self.class.delay_for(5.seconds)
  job.update_paragraphs_in_alegre(id, previous_paragraphs_count, Time.now.to_f)
end

# Background job: indexes each non-blank paragraph of an explainer's
# description in Alegre as its own document, then deletes the documents of
# paragraphs that no longer exist.
#
# id                        - explainer id (looked up with Explainer.find).
# previous_paragraphs_count - number of paragraphs before the save that
#                             scheduled this job.
# timestamp                 - time (float seconds) at which the job was scheduled.
#
# Does nothing when the explainer was updated after `timestamp`. The return
# value is not meaningful.
def self.update_paragraphs_in_alegre(id, previous_paragraphs_count, timestamp)
  explainer = Explainer.find(id)

  # Skip if the explainer was saved since this job was created (it means that there is a more recent job)
  return if explainer.updated_at.to_f > timestamp

  base_context = {
    type: 'explainer',
    team: explainer.team.slug,
    language: explainer.language,
    explainer_id: explainer.id
  }

  # Paragraph numbers are 1-based and are part of the document id
  doc_id_for = ->(number) { Digest::MD5.hexdigest(['explainer', explainer.id, 'paragraph', number].join(':')) }

  # Index paragraphs
  paragraphs = explainer.description.to_s.gsub(/\r\n?/, "\n").split(/\n+/).reject { |paragraph| paragraph.strip.blank? }
  paragraphs.each.with_index(1) do |paragraph, number|
    params = {
      doc_id: doc_id_for.call(number),
      text: paragraph.strip,
      models: ALEGRE_MODELS_AND_THRESHOLDS.keys,
      context: base_context.merge({ paragraph: number })
    }
    Bot::Alegre.request('post', '/text/similarity/', params)
  end

  # Remove paragraphs that don't exist anymore (we delete after updating in order to avoid race conditions).
  # The context carries the number of the paragraph being removed, the same one it was indexed with.
  ((paragraphs.size + 1)..previous_paragraphs_count).each do |number|
    params = {
      doc_id: doc_id_for.call(number),
      quiet: true,
      context: base_context.merge({ paragraph: number })
    }
    Bot::Alegre.request('delete', '/text/similarity/', params)
  end
end

# Asks Alegre for indexed explainer paragraphs similar to `text` and returns
# the explainers they belong to.
#
# text     - text to search for.
# language - language code sent in the search context.
# team_id  - team whose explainers are searched (its slug is sent to Alegre).
#
# Returns a relation with at most three explainers of the team, chosen from
# the best-scoring matches, or an empty relation when Alegre returns nothing.
def self.search_by_similarity(text, language, team_id)
  params = {
    text: text,
    models: ALEGRE_MODELS_AND_THRESHOLDS.keys,
    per_model_threshold: ALEGRE_MODELS_AND_THRESHOLDS,
    context: {
      type: 'explainer',
      team: Team.find(team_id).slug,
      language: language
    }
  }
  response = Bot::Alegre.request('post', '/text/similarity/search/', params)

  # Highest score first, so that the limit below keeps the best matches instead of the weakest ones
  ranked = response['result'].to_a.sort_by { |result| result['_score'] }.reverse

  # Several paragraphs of the same explainer can match, hence `uniq` before limiting
  explainer_ids = ranked.map { |result| result.dig('_source', 'context', 'explainer_id').to_i }.uniq.first(3)
  explainer_ids.empty? ? Explainer.none : Explainer.where(team_id: team_id, id: explainer_ids)
end

private
Expand Down
49 changes: 14 additions & 35 deletions config/initializers/report_designer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -93,41 +93,24 @@ def report_design_team_setting_value(field, language)
self.annotated&.team&.get_report.to_h.with_indifferent_access.dig(language, field) if self.annotation_type == 'report_design'
end

def report_design_text_footer(language)
footer = []
prefixes = {
whatsapp: 'WhatsApp: ',
facebook: 'FB Messenger: m.me/',
twitter: 'Twitter: twitter.com/',
telegram: 'Telegram: t.me/',
viber: 'Viber: ',
line: 'LINE: ',
instagram: 'Instagram: instagram.com/'
}
[:signature, :whatsapp, :facebook, :twitter, :telegram, :viber, :line, :instagram].each do |field|
value = self.report_design_team_setting_value(field.to_s, language)
footer << "#{prefixes[field]}#{value}" unless value.blank?
def report_design_to_tipline_search_result
if self.annotation_type == 'report_design'
TiplineSearchResult.new(
type: :fact_check,
team: self.annotated.team,
title: self.report_design_field_value('title'),
body: self.report_design_field_value('text'),
image_url: self.report_design_image_url,
language: self.report_design_field_value('language'),
url: self.report_design_field_value('published_article_url'),
format: (!self.report_design_field_value('use_text_message') && self.report_design_field_value('use_visual_card')) ? :image : :text
)
end
footer.join("\n")
end

def report_design_text(language = nil, hide_body = false)
if self.annotation_type == 'report_design'
team = self.annotated.team
text = []
title = self.report_design_field_value('title')
text << "*#{title.strip}*" unless title.blank?
text << self.report_design_field_value('text').to_s unless hide_body
url = self.report_design_field_value('published_article_url')
text << url unless url.blank?
text = text.collect do |part|
team.get_shorten_outgoing_urls ? UrlRewriter.shorten_and_utmize_urls(part, team.get_outgoing_urls_utm_code) : part
end
unless language.nil?
footer = self.report_design_text_footer(language)
text << footer if !footer.blank? && self.report_design_team_setting_value('use_signature', language)
end
text.join("\n\n")
self.report_design_to_tipline_search_result.text(language, hide_body)
end
end

Expand Down Expand Up @@ -241,10 +224,6 @@ def sent_count
end

def should_send_report_in_this_language?(language)
team = self.annotated.team
return true if team.get_languages.to_a.size < 2
tbi = TeamBotInstallation.where(team_id: team.id, user: BotUser.alegre_user).last
should_send_report_in_different_language = !tbi&.alegre_settings&.dig('single_language_fact_checks_enabled')
self.annotation_type == 'report_design' && (self.report_design_field_value('language') == language || should_send_report_in_different_language)
self.annotation_type == 'report_design' && self.report_design_to_tipline_search_result.should_send_in_language?(language)
end
end
7 changes: 5 additions & 2 deletions test/models/bot/smooch_4_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,12 @@ def teardown
CheckSearch.any_instance.stubs(:medias).returns([pm1])
Bot::Alegre.stubs(:get_merged_similar_items).returns({ pm2.id => { score: 0.9, model: 'elasticsearch', context: {foo: :bar} } })

assert_equal [pm2], Bot::Smooch.get_search_results(random_string, {}, t.id, 'en')
uid = random_string
query = Bot::Smooch.get_search_query(uid, {})
assert_equal [pm2], Bot::Smooch.get_search_results(uid, query, t.id, 'en')
Bot::Smooch.stubs(:bundle_list_of_messages).returns({ 'type' => 'text', 'text' => "Test #{url}" })
assert_equal [pm1], Bot::Smooch.get_search_results(random_string, {}, t.id, 'en')
query = Bot::Smooch.get_search_query(uid, {})
assert_equal [pm1], Bot::Smooch.get_search_results(uid, query, t.id, 'en')

ProjectMedia.any_instance.unstub(:report_status)
CheckSearch.any_instance.unstub(:medias)
Expand Down
Loading

0 comments on commit 1192dad

Please sign in to comment.