Skip to content

Commit

Permalink
Be able to export a full list of media clusters. (#2024)
Browse files Browse the repository at this point in the history
The current export limit for media lists is 10,000 because that is the maximum size of a result window in Elasticsearch. The solution is to paginate the results with `search_after`.

Fixes: CV2-5205.
  • Loading branch information
caiosba committed Sep 10, 2024
1 parent 7caf085 commit 905b37f
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .codeclimate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ checks:
enabled: false
method-complexity:
config:
threshold: 22
threshold: 25
method-count:
config:
threshold: 65
Expand Down
82 changes: 56 additions & 26 deletions lib/check_search.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def team
Team.find_by_id(team_id)
end

# Reader for the @feed instance variable.
# NOTE(review): the assignment of @feed is not visible in this excerpt; callers in
# get_exported_data treat a falsy result as "no feed" — confirm it is nil when unset.
def feed
@feed
end

# Always returns an empty array.
# NOTE(review): presumably a placeholder so this object answers `teams` — confirm against callers.
def teams
[]
end
Expand Down Expand Up @@ -335,40 +339,66 @@ def medias_get_search_result(query)

# Builds the export rows for a media list: a header array followed by one array per ProjectMedia.
# `query` and `team_id` are handed to CheckSearch.new; `team_id` is also used to load the Team,
# which is then assigned to Team.current.
# Returns `data`, whose first element is the header.
# NOTE(review): this excerpt is a rendered diff without +/- markers, so lines of the earlier
# single-query version and of the search_after-paginated version appear side by side below.
def self.get_exported_data(query, team_id)
team = Team.find(team_id)
Team.current = team
search = CheckSearch.new(query, nil, team_id)
# Truthy only when the search has a feed whose data_points equals [1];
# it selects the fact-check column layout for both the header and the rows.
feed_sharing_only_fact_checks = (search.feed && search.feed.data_points == [1])

# Prepare the export
data = []
header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags']
fields = team.team_tasks.sort
fields.each { |tt| header << tt.label }
header = nil
if feed_sharing_only_fact_checks
header = ['Fact-check title', 'Fact-check summary', 'Fact-check URL', 'Tags', 'Workspace', 'Updated at', 'Rating']
else
header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags']
# One extra column per team task, labelled with the task's label.
fields = team.team_tasks.sort
fields.each { |tt| header << tt.label }
end
data << header

# NOTE(review): from here down to `row << answer` looks like the removed side of the diff
# (a single query limited by `eslimit`) — confirm against the repository.
# No pagination for the export
search.set_option('esoffset', 0)
search.set_option('eslimit', CheckConfig.get(:export_csv_maximum_number_of_results, 10000, :integer))

# Iterate through each result and generate an output row for the CSV
search.medias.find_each do |pm|
row = [
pm.claim_description&.description,
pm.full_url,
pm.status_i18n,
pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
pm.linked_items_count,
pm.tags_as_sentence
]
annotations = pm.get_annotations('task').map(&:load)
fields.each do |field|
annotation = annotations.find { |a| a.team_task_id == field.id }
answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
row << answer
# Paginate
# Cursor passed as search_after; the loop ends once it becomes empty.
search_after = [0]
while !search_after.empty?
# Fetch up to 10000 hits after the cursor, returning only annotated_id, sorted by annotated_id ascending.
result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results
ids = result.collect{ |i| i['annotated_id'] }.uniq.compact.map(&:to_i)

# Iterate through each result and generate an output row for the CSV
ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm|
row = nil
if feed_sharing_only_fact_checks
# Same column order as the fact-check header above.
row = [
pm.fact_check_title,
pm.fact_check_summary,
pm.fact_check_url,
pm.tags_as_sentence,
pm.team_name,
pm.updated_at_timestamp,
pm.status
]
else
row = [
pm.claim_description&.description,
pm.full_url,
pm.status_i18n,
# Strip a trailing " [...]" suffix from the author name.
pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
pm.linked_items_count,
pm.tags_as_sentence
]
annotations = pm.get_annotations('task').map(&:load)
fields.each do |field|
annotation = annotations.find { |a| a.team_task_id == field.id }
# Prefer the uploaded file URLs; if that lookup raises, fall back to the first response as a string.
answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
# If the answer parses as JSON, join each element's 'url'; otherwise keep the answer unchanged.
answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
row << answer
end
end
data << row
end
data << row

# Next cursor is the largest id of this page; an empty page gives [] and stops the loop.
search_after = [ids.max].compact
end

data
end

Expand Down
45 changes: 40 additions & 5 deletions test/lib/list_export_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,26 @@ def teardown
end
end

# Exports a team's media list to CSV, downloads the file and expects two data rows.
# NOTE(review): rendered diff without +/- markers — the two `test ... do` lines, the two fixture
# setups and the two ListExport.new calls are presumably old/new pairs; confirm against the repository.
test "should export media CSV" do
test "should export media (including child media) CSV" do
setup_elasticsearch
t = create_team
create_team_task team_id: t.id, fieldset: 'tasks'
2.times { create_project_media team: t }
# Two items linked as parent and child through a confirmed relationship.
parent = create_project_media team: t, disable_es_callbacks: false
child = create_project_media team: t, disable_es_callbacks: false
create_relationship source_id: parent.id, target_id: child.id, relationship_type: Relationship.confirmed_type

export = ListExport.new(:media, '{}', t.id)
sleep 2 # Wait for indexing

# The query is serialized to JSON with show_similar set to true.
export = ListExport.new(:media, { show_similar: true }.to_json, t.id)
csv_url = export.generate_csv_and_send_email(create_user)
response = Net::HTTP.get_response(URI(csv_url))
assert_equal 200, response.code.to_i
# headers: true makes the first CSV line the header, so size counts data rows only.
csv_content = CSV.parse(response.body, headers: true)
assert_equal 2, csv_content.size
assert_equal 2, export.number_of_rows
assert_equal 2, csv_content.size
end

test "should export feed CSV" do
test "should export media feed CSV" do
t = create_team
f = create_feed team: t
2.times { f.clusters << create_cluster }
Expand All @@ -54,6 +59,36 @@ def teardown
assert_equal 2, export.number_of_rows
end

# Exports the media list of a published feed whose data_points is [1] and checks
# that the downloaded CSV holds one data row per published report (two in total).
test "should export fact-check feed CSV" do
  setup_elasticsearch
  RequestStore.store[:skip_cached_field_update] = false

  # Any GET whose URL matches the configured private Pender URL is answered with an empty JSON object.
  pender_url = CheckConfig.get('pender_url_private')
  WebMock.stub_request(:get, /#{pender_url}/).to_return(body: '{}', status: 200)

  team = create_team
  2.times do
    media = create_project_media(team: team, disable_es_callbacks: false)
    published = publish_report(media, {}, nil, { language: 'en', use_visual_card: false })
    # Reload the report as a Dynamic and save it as published with ES callbacks enabled.
    report = Dynamic.find(published.id)
    report.disable_es_callbacks = false
    report.set_fields = { state: 'published' }.to_json
    report.save!
  end
  saved_search = create_saved_search(team: team)
  feed = create_feed(team: team, data_points: [1], saved_search: saved_search, published: true)

  sleep 2 # Wait for indexing

  export_query = { feed_id: feed.id, feed_view: 'fact_check' }.to_json
  export = ListExport.new(:media, export_query, team.id)
  csv_url = export.generate_csv_and_send_email(create_user)
  response = Net::HTTP.get_response(URI(csv_url))
  assert_equal 200, response.code.to_i
  # headers: true treats the first line as the header, so size counts data rows only.
  csv_rows = CSV.parse(response.body, headers: true)
  assert_equal 2, export.number_of_rows
  assert_equal 2, csv_rows.size
end

test "should export fact-checks CSV" do
t = create_team
2.times do
Expand Down

0 comments on commit 905b37f

Please sign in to comment.