Skip to content

Commit

Permalink
Be able to export a full list of media clusters. (#2024)
Browse files Browse the repository at this point in the history
The current export limit for media lists is 10,000 because this is the maximum size of a result window in Elasticsearch. The solution is to paginate the results.

Fixes: CV2-5205.
  • Loading branch information
caiosba authored Sep 10, 2024
1 parent dab3b7d commit 7fed86c
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .codeclimate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ checks:
enabled: false
method-complexity:
config:
threshold: 22
threshold: 25
method-count:
config:
threshold: 65
Expand Down
82 changes: 56 additions & 26 deletions lib/check_search.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def team
Team.find_by_id(team_id)
end

# Reader for the feed resolved for this search (set elsewhere as @feed).
# Returns nil when the search is not scoped to a feed; used by
# get_exported_data to decide the CSV layout.
def feed
@feed
end

# Base implementation: no additional teams in scope for this search.
# NOTE(review): presumably overridden by feed-scoped subclasses — confirm against callers.
def teams
[]
end
Expand Down Expand Up @@ -335,40 +339,66 @@ def medias_get_search_result(query)

# Builds the rows (header + one row per ProjectMedia) for a CSV export of a search.
#
# NOTE(review): this span is a rendered unified diff — it interleaves the REMOVED
# (pre-change) and ADDED (post-change) lines of this method with no +/- markers,
# so as shown it is not valid Ruby. Comments below mark which side each fragment
# appears to belong to; verify against the actual commit before editing.
def self.get_exported_data(query, team_id)
team = Team.find(team_id)
Team.current = team
search = CheckSearch.new(query, nil, team_id)
# ADDED: a feed whose data_points == [1] shares only fact-checks, which selects
# the fact-check CSV layout below.
feed_sharing_only_fact_checks = (search.feed && search.feed.data_points == [1])

# Prepare the export
data = []
# REMOVED version: single fixed header plus one column per team task.
header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags']
fields = team.team_tasks.sort
fields.each { |tt| header << tt.label }
# ADDED version: header layout depends on whether the feed shares only fact-checks.
header = nil
if feed_sharing_only_fact_checks
header = ['Fact-check title', 'Fact-check summary', 'Fact-check URL', 'Tags', 'Workspace', 'Updated at', 'Rating']
else
header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags']
fields = team.team_tasks.sort
fields.each { |tt| header << tt.label }
end
data << header

# REMOVED version: single unpaginated query, capped by the Elasticsearch
# result-window limit (default 10,000) via the eslimit option.
# No pagination for the export
search.set_option('esoffset', 0)
search.set_option('eslimit', CheckConfig.get(:export_csv_maximum_number_of_results, 10000, :integer))

# REMOVED version: iterate the capped result set directly.
# Iterate through each result and generate an output row for the CSV
search.medias.find_each do |pm|
row = [
pm.claim_description&.description,
pm.full_url,
pm.status_i18n,
pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
pm.linked_items_count,
pm.tags_as_sentence
]
annotations = pm.get_annotations('task').map(&:load)
fields.each do |field|
annotation = annotations.find { |a| a.team_task_id == field.id }
answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
row << answer
# ADDED version: keyset pagination over Elasticsearch using search_after on
# annotated_id (ascending), 10,000 ids per page, so exports are no longer
# capped by the result-window limit.
# Paginate
search_after = [0]
while !search_after.empty?
result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results
ids = result.collect{ |i| i['annotated_id'] }.uniq.compact.map(&:to_i)

# Iterate through each result and generate an output row for the CSV
ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm|
row = nil
if feed_sharing_only_fact_checks
row = [
pm.fact_check_title,
pm.fact_check_summary,
pm.fact_check_url,
pm.tags_as_sentence,
pm.team_name,
pm.updated_at_timestamp,
pm.status
]
else
row = [
pm.claim_description&.description,
pm.full_url,
pm.status_i18n,
pm.author_name.to_s.gsub(/ \[.*\]$/, ''),
pm.created_at.strftime("%Y-%m-%d %H:%M:%S"),
pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"),
pm.linked_items_count,
pm.tags_as_sentence
]
# One extra column per team task: the answer to that task for this item.
annotations = pm.get_annotations('task').map(&:load)
fields.each do |field|
annotation = annotations.find { |a| a.team_task_id == field.id }
answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '')
answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end
row << answer
end
end
data << row
end
data << row

# Advance the cursor to the highest id seen; when a page returns no ids,
# compact yields [] and the while loop above terminates.
search_after = [ids.max].compact
end

data
end

Expand Down
45 changes: 40 additions & 5 deletions test/lib/list_export_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,26 @@ def teardown
end
end

# NOTE(review): rendered unified diff — the old and new versions of this test are
# interleaved (two `test … do` openers, one `end`), so this span is not valid Ruby
# as shown. Comments mark which side each fragment appears to belong to.
# REMOVED test name:
test "should export media CSV" do
# ADDED test name — now also covers child (similar) media:
test "should export media (including child media) CSV" do
setup_elasticsearch
t = create_team
create_team_task team_id: t.id, fieldset: 'tasks'
# REMOVED setup: two independent items.
2.times { create_project_media team: t }
# ADDED setup: a parent/child pair joined by a confirmed relationship, with ES callbacks on.
parent = create_project_media team: t, disable_es_callbacks: false
child = create_project_media team: t, disable_es_callbacks: false
create_relationship source_id: parent.id, target_id: child.id, relationship_type: Relationship.confirmed_type

# REMOVED: export with an empty query.
export = ListExport.new(:media, '{}', t.id)
sleep 2 # Wait for indexing

# ADDED: export with show_similar so the child item is included.
export = ListExport.new(:media, { show_similar: true }.to_json, t.id)
csv_url = export.generate_csv_and_send_email(create_user)
response = Net::HTTP.get_response(URI(csv_url))
assert_equal 200, response.code.to_i
csv_content = CSV.parse(response.body, headers: true)
assert_equal 2, csv_content.size
assert_equal 2, export.number_of_rows
assert_equal 2, csv_content.size
end

test "should export feed CSV" do
test "should export media feed CSV" do
t = create_team
f = create_feed team: t
2.times { f.clusters << create_cluster }
Expand All @@ -54,6 +59,36 @@ def teardown
assert_equal 2, export.number_of_rows
end

# Exercises the fact-check CSV layout of the export: a feed with data_points: [1]
# (fact-checks only) should produce one row per published report.
# NOTE(review): statement order matters here — the WebMock stub must be in place
# before reports are published, and the sleep gives Elasticsearch time to index.
test "should export fact-check feed CSV" do
setup_elasticsearch
RequestStore.store[:skip_cached_field_update] = false

# Stub all Pender (link-parsing service) requests so no real HTTP calls happen.
pender_url = CheckConfig.get('pender_url_private')
WebMock.stub_request(:get, /#{pender_url}/).to_return(body: '{}', status: 200)

t = create_team
# Create two items, each with a published report (the exportable fact-checks).
2.times do
pm = create_project_media team: t, disable_es_callbacks: false
r = publish_report(pm, {}, nil, { language: 'en', use_visual_card: false })
r = Dynamic.find(r.id)
r.disable_es_callbacks = false
r.set_fields = { state: 'published' }.to_json
r.save!
end
ss = create_saved_search team: t
# data_points: [1] marks the feed as sharing only fact-checks.
f = create_feed team: t, data_points: [1], saved_search: ss, published: true

sleep 2 # Wait for indexing

export = ListExport.new(:media, { feed_id: f.id, feed_view: 'fact_check' }.to_json, t.id)
csv_url = export.generate_csv_and_send_email(create_user)
response = Net::HTTP.get_response(URI(csv_url))
assert_equal 200, response.code.to_i
csv_content = CSV.parse(response.body, headers: true)
assert_equal 2, export.number_of_rows
assert_equal 2, csv_content.size
end

test "should export fact-checks CSV" do
t = create_team
2.times do
Expand Down

0 comments on commit 7fed86c

Please sign in to comment.