Be able to export a full list of media clusters.
The current export limit for media lists is 10,000, because that is the maximum size of a result window in Elasticsearch. The solution is to paginate the results.

Fixes: CV2-5205.
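
In outline, the change wraps the existing CSV row generation in an offset-based loop over Elasticsearch pages. A condensed sketch of the loop introduced by this commit (names match the diff below; row_for is only a placeholder for the unchanged row-building code, not a real method):

  page_size = 10000
  total = CheckSearch.new(query, nil, team_id).number_of_results
  offset = 0
  while offset < total
    search = CheckSearch.new(query, nil, team_id)
    search.set_option('eslimit', page_size)  # size of each Elasticsearch page
    search.set_option('esoffset', offset)    # starting offset of the current page
    search.medias.find_each { |pm| data << row_for(pm) }  # row_for stands in for the row-building code shown in the diff
    offset += page_size
  end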
caiosba committed Sep 5, 2024
1 parent 4dcbe5d commit f3501d9
Showing 1 changed file with 33 additions and 24 deletions.
57 changes: 33 additions & 24 deletions lib/check_search.rb
@@ -335,7 +335,6 @@ def medias_get_search_result(query)
 
   def self.get_exported_data(query, team_id)
     team = Team.find(team_id)
-    search = CheckSearch.new(query, nil, team_id)
 
     # Prepare the export
     data = []
@@ -344,31 +343,41 @@ def self.get_exported_data(query, team_id)
     fields.each { |tt| header << tt.label }
     data << header
 
-    # No pagination for the export
-    search.set_option('esoffset', 0)
-    search.set_option('eslimit', CheckConfig.get(:export_csv_maximum_number_of_results, 10000, :integer))
-
-    # Iterate through each result and generate an output row for the CSV
-    search.medias.find_each do |pm|
-      row = [
-        pm.claim_description&.description,
-        pm.full_url,
-        pm.status_i18n,
-        pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
-        pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
-        pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
-        pm.linked_items_count,
-        pm.tags_as_sentence
-      ]
-      annotations = pm.get_annotations('task').map(&:load)
-      fields.each do |field|
-        annotation = annotations.find { |a| a.team_task_id == field.id }
-        answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
-        answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
-        row << answer
+    # Paginate
+    page_size = 10000
+    search = CheckSearch.new(query, nil, team_id)
+    total = search.number_of_results
+    offset = 0
+    while offset < total
+      search = CheckSearch.new(query, nil, team_id)
+      search.set_option('eslimit', page_size)
+      search.set_option('esoffset', offset)
+
+      # Iterate through each result and generate an output row for the CSV
+      search.medias.find_each do |pm|
+        row = [
+          pm.claim_description&.description,
+          pm.full_url,
+          pm.status_i18n,
+          pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
+          pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
+          pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
+          pm.linked_items_count,
+          pm.tags_as_sentence
+        ]
+        annotations = pm.get_annotations('task').map(&:load)
+        fields.each do |field|
+          annotation = annotations.find { |a| a.team_task_id == field.id }
+          answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
+          answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
+          row << answer
+        end
+        data << row
       end
-      data << row
+
+      offset += page_size
     end
 
     data
   end
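
For context, get_exported_data returns an array of CSV rows with the header row first. A hypothetical call site might look like the sketch below; the query contents, the exact query format CheckSearch expects, and the CSV serialization are assumptions, not part of this commit:

  require 'csv'

  team_id = 1                       # hypothetical team id
  query = { 'archived' => ['0'] }   # hypothetical filter; use whatever query CheckSearch accepts
  rows = CheckSearch.get_exported_data(query, team_id)
  csv = CSV.generate { |out| rows.each { |row| out << row } }
  File.write('export.csv', csv)     # output path is an assumption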

