From f3501d99809d7efa4c3a11e92a2883ea9e472d40 Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:40:39 -0300 Subject: [PATCH 1/7] Be able to export a full list of media clusters. The current export limit for media lists is 10,000 because this is the maximum size of a result window in Elasticsearch. The solution is to paginate the results. Fixes: CV2-5205. --- lib/check_search.rb | 57 ++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/lib/check_search.rb b/lib/check_search.rb index ce8746209..b156e3f40 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -335,7 +335,6 @@ def medias_get_search_result(query) def self.get_exported_data(query, team_id) team = Team.find(team_id) - search = CheckSearch.new(query, nil, team_id) # Prepare the export data = [] @@ -344,31 +343,41 @@ def self.get_exported_data(query, team_id) fields.each { |tt| header << tt.label } data << header - # No pagination for the export - search.set_option('esoffset', 0) - search.set_option('eslimit', CheckConfig.get(:export_csv_maximum_number_of_results, 10000, :integer)) - - # Iterate through each result and generate an output row for the CSV - search.medias.find_each do |pm| - row = [ - pm.claim_description&.description, - pm.full_url, - pm.status_i18n, - pm.author_name.to_s.gsub(/ \[.*\]$/, ''), - pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), - pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), - pm.linked_items_count, - pm.tags_as_sentence - ] - annotations = pm.get_annotations('task').map(&:load) - fields.each do |field| - annotation = annotations.find { |a| a.team_task_id == field.id } - answer = (annotation ? 
(begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') - answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end - row << answer + # Paginate + page_size = 10000 + search = CheckSearch.new(query, nil, team_id) + total = search.number_of_results + offset = 0 + while offset < total + search = CheckSearch.new(query, nil, team_id) + search.set_option('eslimit', page_size) + search.set_option('esoffset', offset) + + # Iterate through each result and generate an output row for the CSV + search.medias.find_each do |pm| + row = [ + pm.claim_description&.description, + pm.full_url, + pm.status_i18n, + pm.author_name.to_s.gsub(/ \[.*\]$/, ''), + pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), + pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), + pm.linked_items_count, + pm.tags_as_sentence + ] + annotations = pm.get_annotations('task').map(&:load) + fields.each do |field| + annotation = annotations.find { |a| a.team_task_id == field.id } + answer = (annotation ? 
(begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') + answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end + row << answer + end + data << row end - data << row + + offset += page_size end + data end From 35061bcb6ab252ce6ebd59af4664b5389ebda9f5 Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Sun, 8 Sep 2024 22:01:44 -0300 Subject: [PATCH 2/7] Applying code review, which now supports more than 10,000 results too --- lib/check_search.rb | 17 ++++++++--------- test/lib/list_export_test.rb | 13 +++++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lib/check_search.rb b/lib/check_search.rb index b156e3f40..e4debbfe2 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -335,6 +335,7 @@ def medias_get_search_result(query) def self.get_exported_data(query, team_id) team = Team.find(team_id) + Team.current = team # Prepare the export data = [] @@ -344,17 +345,15 @@ def self.get_exported_data(query, team_id) data << header # Paginate - page_size = 10000 search = CheckSearch.new(query, nil, team_id) - total = search.number_of_results - offset = 0 - while offset < total - search = CheckSearch.new(query, nil, team_id) - search.set_option('eslimit', page_size) - search.set_option('esoffset', offset) + search_after = [0] + while true + result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results + ids = result.collect{ |i| i['annotated_id'] }.uniq.map(&:to_i) + break if ids.empty? 
# Iterate through each result and generate an output row for the CSV - search.medias.find_each do |pm| + ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm| row = [ pm.claim_description&.description, pm.full_url, @@ -375,7 +374,7 @@ def self.get_exported_data(query, team_id) data << row end - offset += page_size + search_after = [ids.max] end data diff --git a/test/lib/list_export_test.rb b/test/lib/list_export_test.rb index 668797d6e..dfff4e524 100644 --- a/test/lib/list_export_test.rb +++ b/test/lib/list_export_test.rb @@ -26,18 +26,23 @@ def teardown end end - test "should export media CSV" do + test "should export media (including child media) CSV" do + setup_elasticsearch t = create_team create_team_task team_id: t.id, fieldset: 'tasks' - 2.times { create_project_media team: t } + parent = create_project_media team: t, disable_es_callbacks: false + child = create_project_media team: t, disable_es_callbacks: false + create_relationship source_id: parent.id, target_id: child.id, relationship_type: Relationship.confirmed_type - export = ListExport.new(:media, '{}', t.id) + sleep 2 # Wait for indexing + + export = ListExport.new(:media, { show_similar: true }.to_json, t.id) csv_url = export.generate_csv_and_send_email(create_user) response = Net::HTTP.get_response(URI(csv_url)) assert_equal 200, response.code.to_i csv_content = CSV.parse(response.body, headers: true) - assert_equal 2, csv_content.size assert_equal 2, export.number_of_rows + assert_equal 2, csv_content.size end test "should export feed CSV" do From 55bbfb4d235086003da9cd9d897cf771238be99a Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:33:43 -0300 Subject: [PATCH 3/7] Fixing CC issue --- lib/check_search.rb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/check_search.rb b/lib/check_search.rb index 86e3c045a..57fa535ce 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ 
-347,10 +347,9 @@ def self.get_exported_data(query, team_id) # Paginate search = CheckSearch.new(query, nil, team_id) search_after = [0] - while true + while !search_after.empty? result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results - ids = result.collect{ |i| i['annotated_id'] }.uniq.map(&:to_i) - break if ids.empty? + ids = result.collect{ |i| i['annotated_id'] }.uniq.compact.map(&:to_i) # Iterate through each result and generate an output row for the CSV ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm| @@ -374,7 +373,7 @@ def self.get_exported_data(query, team_id) data << row end - search_after = [ids.max] + search_after = [ids.max].compact end data From c1d5047339bfc9d063f98dddc5ec0ee8bbc42948 Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:29:51 -0300 Subject: [PATCH 4/7] Fixing export for feeds sharing only fact-checks --- lib/check_search.rb | 63 +++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/lib/check_search.rb b/lib/check_search.rb index 57fa535ce..d9fbb18b2 100644 --- a/lib/check_search.rb +++ b/lib/check_search.rb @@ -89,6 +89,10 @@ def team Team.find_by_id(team_id) end + def feed + @feed + end + def teams [] end @@ -336,16 +340,22 @@ def medias_get_search_result(query) def self.get_exported_data(query, team_id) team = Team.find(team_id) Team.current = team + search = CheckSearch.new(query, nil, team_id) + feed_sharing_only_fact_checks = (search.feed && search.feed.data_points == [1]) # Prepare the export data = [] - header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags'] - fields = team.team_tasks.sort - fields.each { |tt| header << tt.label } + header = nil + if feed_sharing_only_fact_checks + header = ['Fact-check 
title', 'Fact-check summary', 'Fact-check URL', 'Tags', 'Workspace', 'Updated at', 'Rating'] + else + header = ['Claim', 'Item page URL', 'Status', 'Created by', 'Submitted at', 'Published at', 'Number of media', 'Tags'] + fields = team.team_tasks.sort + fields.each { |tt| header << tt.label } + end data << header # Paginate - search = CheckSearch.new(query, nil, team_id) search_after = [0] while !search_after.empty? result = $repository.search(_source: 'annotated_id', query: search.medias_query, sort: [{ annotated_id: { order: :asc } }], size: 10000, search_after: search_after).results @@ -353,22 +363,35 @@ def self.get_exported_data(query, team_id) # Iterate through each result and generate an output row for the CSV ProjectMedia.where(id: ids, team_id: search.team_condition(team_id)).find_each do |pm| - row = [ - pm.claim_description&.description, - pm.full_url, - pm.status_i18n, - pm.author_name.to_s.gsub(/ \[.*\]$/, ''), - pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), - pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), - pm.linked_items_count, - pm.tags_as_sentence - ] - annotations = pm.get_annotations('task').map(&:load) - fields.each do |field| - annotation = annotations.find { |a| a.team_task_id == field.id } - answer = (annotation ? 
(begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') - answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end - row << answer + row = nil + if feed_sharing_only_fact_checks + row = [ + pm.fact_check_title, + pm.fact_check_summary, + pm.fact_check_url, + pm.tags_as_sentence, + pm.team_name, + pm.updated_at_timestamp, + pm.status + ] + else + row = [ + pm.claim_description&.description, + pm.full_url, + pm.status_i18n, + pm.author_name.to_s.gsub(/ \[.*\]$/, ''), + pm.created_at.strftime("%Y-%m-%d %H:%M:%S"), + pm.published_at&.strftime("%Y-%m-%d %H:%M:%S"), + pm.linked_items_count, + pm.tags_as_sentence + ] + annotations = pm.get_annotations('task').map(&:load) + fields.each do |field| + annotation = annotations.find { |a| a.team_task_id == field.id } + answer = (annotation ? (begin annotation.first_response_obj.file_data[:file_urls].join("\n") rescue annotation.first_response.to_s end) : '') + answer = begin JSON.parse(answer).collect{ |x| x['url'] }.join(', ') rescue answer end + row << answer + end end data << row end From 14cbefa78d3defcfc64a0c033032a231b263d427 Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:34:38 -0300 Subject: [PATCH 5/7] Adding test for new case --- test/lib/list_export_test.rb | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/test/lib/list_export_test.rb b/test/lib/list_export_test.rb index dfff4e524..e850977bb 100644 --- a/test/lib/list_export_test.rb +++ b/test/lib/list_export_test.rb @@ -45,7 +45,7 @@ def teardown assert_equal 2, csv_content.size end - test "should export feed CSV" do + test "should export media feed CSV" do t = create_team f = create_feed team: t 2.times { f.clusters << create_cluster } @@ -59,6 +59,32 @@ def teardown assert_equal 2, export.number_of_rows end + test "should export fact-check feed CSV" do + setup_elasticsearch 
+ RequestStore.store[:skip_cached_field_update] = false + t = create_team + 2.times do + pm = create_project_media team: t, disable_es_callbacks: false + r = publish_report(pm, {}, nil, { language: 'en', use_visual_card: false }) + r = Dynamic.find(r.id) + r.disable_es_callbacks = false + r.set_fields = { state: 'published' }.to_json + r.save! + end + ss = create_saved_search team: t + f = create_feed team: t, data_points: [1], saved_search: ss, published: true + + sleep 2 # Wait for indexing + + export = ListExport.new(:media, { feed_id: f.id, feed_view: 'fact_check' }.to_json, t.id) + csv_url = export.generate_csv_and_send_email(create_user) + response = Net::HTTP.get_response(URI(csv_url)) + assert_equal 200, response.code.to_i + csv_content = CSV.parse(response.body, headers: true) + assert_equal 2, export.number_of_rows + assert_equal 2, csv_content.size + end + test "should export fact-checks CSV" do t = create_team 2.times do From dff87af62ac188dfb50e8f82ce3bd8bf2d97700c Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:38:28 -0300 Subject: [PATCH 6/7] Updating Code Climate --- .codeclimate.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codeclimate.yml b/.codeclimate.yml index 773f85a55..ff19a8b13 100644 --- a/.codeclimate.yml +++ b/.codeclimate.yml @@ -10,7 +10,7 @@ checks: enabled: false method-complexity: config: - threshold: 22 + threshold: 25 method-count: config: threshold: 65 From 896e08a5a8e43ea0e7ee1893ccad1e72c743179d Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:42:57 -0300 Subject: [PATCH 7/7] Fixing test --- test/lib/list_export_test.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/lib/list_export_test.rb b/test/lib/list_export_test.rb index e850977bb..15551ba12 100644 --- a/test/lib/list_export_test.rb +++ b/test/lib/list_export_test.rb @@ -62,6 +62,10 @@ def teardown test "should export fact-check 
feed CSV" do setup_elasticsearch RequestStore.store[:skip_cached_field_update] = false + + pender_url = CheckConfig.get('pender_url_private') + WebMock.stub_request(:get, /#{pender_url}/).to_return(body: '{}', status: 200) + t = create_team 2.times do pm = create_project_media team: t, disable_es_callbacks: false