Skip to content

Commit

Permalink
CV2-4458: improve-cluster-rake-task (#1853)
Browse files Browse the repository at this point in the history
* CV2-4458: add uuid column to media

* CV2-4458: fix rake task

* CV2-4458: use downcase to get uuid
  • Loading branch information
melsawy committed Apr 9, 2024
1 parent d5a975e commit 216147b
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 47 deletions.
10 changes: 6 additions & 4 deletions app/models/claim.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ def media_type
'quote'
end

# Identifier shared by claims that carry the same quote.
#
# Looks for the first Claim media whose quote equals this record's stripped
# quote and that is joined to at least one project_medias row; when no such
# media exists, this record's own id is used.
def uuid
  normalized_quote = self.quote.to_s.strip
  shared = Media.where(type: 'Claim', quote: normalized_quote)
                .joins("INNER JOIN project_medias pm ON pm.media_id = medias.id")
                .first
  shared&.id || self.id
end

private

# Replaces every NUL character in the quote with the literal six-character
# text "\u0000". Does nothing when the quote is nil.
# NOTE(review): presumably needed because the database rejects NUL bytes in
# text columns; confirm.
def remove_null_bytes
  return if self.quote.nil?

  self.quote = self.quote.gsub("\u0000", "\\u0000")
end

# Persists the claim's uuid without running validations or callbacks.
#
# The uuid is the id of the first Claim media whose lower-cased quote equals
# this record's stripped, lower-cased quote and that is joined to at least
# one project_medias row; when none is found, this record's own id is used.
def set_uuid
  normalized_quote = self.quote.to_s.strip.downcase
  shared = Media.where(type: 'Claim')
                .where('lower(quote) = ?', normalized_quote)
                .joins("INNER JOIN project_medias pm ON pm.media_id = medias.id")
                .first
  update_column(:uuid, shared&.id || self.id)
end
end
10 changes: 6 additions & 4 deletions app/models/media.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ class Media < ApplicationRecord

# These callbacks run before validation, and only when a record is being created.
before_validation :set_type, :set_url_nil_if_empty, :set_user, on: :create

# Runs after the row has been inserted; set_uuid writes the uuid column.
after_create :set_uuid

# Returns the list of media type names as an array of strings.
def self.types
  [
    'Link',
    'Claim',
    'UploadedFile',
    'UploadedImage',
    'UploadedVideo',
    'UploadedAudio',
    'Blank'
  ]
end
Expand Down Expand Up @@ -74,10 +76,6 @@ def domain
''
end

# The uuid of a media is simply its own id.
def uuid
  id
end

private

def set_url_nil_if_empty
Expand All @@ -101,4 +99,8 @@ def self.class_from_input(input)
# Fills in the type when none was given, letting Media.class_from_input
# pick it from the record's url and quote. An existing type is left alone.
def set_type
  return unless self.type.blank?

  input = { url: self.url, quote: self.quote }
  self.type = Media.class_from_input(input)
end

# Copies the record's id into its uuid column via update_column.
def set_uuid
  update_column(:uuid, id)
end
end
5 changes: 5 additions & 0 deletions db/migrate/20240404154458_add_uuid_to_medias.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds an integer `uuid` column to the `medias` table.
class AddUuidToMedias < ActiveRecord::Migration[6.1]
def change
# NOT NULL with a default of 0, so existing rows hold 0 after this migration.
add_column :medias, :uuid, :integer, null: false, default: 0
end
end
6 changes: 3 additions & 3 deletions db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2024_03_04_160338) do
ActiveRecord::Schema.define(version: 2024_04_04_154458) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand Down Expand Up @@ -293,7 +293,6 @@
t.index ["field_type"], name: "index_dynamic_annotation_fields_on_field_type"
t.index ["value"], name: "fetch_unique_id", unique: true, where: "(((field_name)::text = 'external_id'::text) AND (value <> ''::text) AND (value <> '\"\"'::text))"
t.index ["value"], name: "index_status", where: "((field_name)::text = 'verification_status_status'::text)"
t.index ["value"], name: "smooch_request_message_id_unique_id", unique: true, where: "(((field_name)::text = 'smooch_message_id'::text) AND (value <> ''::text) AND (value <> '\"\"'::text))"
t.index ["value"], name: "smooch_user_unique_id", unique: true, where: "(((field_name)::text = 'smooch_user_id'::text) AND (value <> ''::text) AND (value <> '\"\"'::text))"
t.index ["value"], name: "translation_request_id", unique: true, where: "((field_name)::text = 'translation_request_id'::text)"
t.index ["value_json"], name: "index_dynamic_annotation_fields_on_value_json", using: :gin
Expand Down Expand Up @@ -389,6 +388,7 @@
t.string "type"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.integer "uuid", default: 0, null: false
t.index ["url"], name: "index_medias_on_url", unique: true
end

Expand Down Expand Up @@ -850,7 +850,7 @@
t.integer "failed_attempts", default: 0, null: false
t.string "unlock_token"
t.datetime "locked_at"
t.datetime "last_received_terms_email_at", default: -> { "CURRENT_TIMESTAMP" }
t.datetime "last_received_terms_email_at"
t.index ["confirmation_token"], name: "index_users_on_confirmation_token", unique: true
t.index ["email"], name: "index_users_on_email", unique: true, where: "((email IS NOT NULL) AND ((email)::text <> ''::text))"
t.index ["invitation_token"], name: "index_users_on_invitation_token", unique: true
Expand Down
42 changes: 6 additions & 36 deletions lib/tasks/check_khousheh.rake
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,6 @@ namespace :check do
puts
end

# Finds the uuid shared by every claim whose text is exactly `quote`.
#
# Elasticsearch is searched for Claim items whose title or description
# contains the words of the quote; hits are then narrowed to those whose
# title or description equals the quote exactly. The uuid is the smallest
# media_id among the matching project medias.
#
# @param quote [String, nil] the claim text
# @return [String, nil] the uuid as a string, or nil when the quote is
#   nil/blank or nothing matches (callers fall back to the media's own id)
def get_claim_uuid(quote)
  # A nil or whitespace-only quote has nothing to search for.
  return nil if quote.to_s.strip.empty?

  quote_es = quote[0..1023]
  # Drop the last word: the 1024-character cut may have split it, and the
  # query below requires every word to match (`AND`). A quote with no space
  # is kept whole.
  last_space = quote_es.rindex(' ')
  quote_es = quote_es[0...last_space] unless last_space.nil?
  # The quote is stored in title, or in description for tipline items, so
  # both fields are searched.
  query = {
    bool: {
      must: [
        { term: { associated_type: { value: 'Claim' } } },
        {
          simple_query_string: {
            fields: ["title", "description"],
            query: quote_es,
            default_operator: "AND"
          }
        }
      ]
    }
  }
  result = $repository.search(query: query, size: 10000)
  pm_ids = []
  result.each do |r|
    # The search is only a pre-filter; keep exact matches on the full quote.
    pm_ids << r['annotated_id'] if r['title'] == quote || r['description'] == quote
  end
  # Let the database compute the minimum instead of loading every record;
  # this also ignores NULL media_id values.
  uuid = ProjectMedia.where(id: pm_ids.compact.uniq).minimum(:media_id)
  uuid&.to_s
end

# docker-compose exec -e elasticsearch_log=0 api bundle exec rake check:khousheh:generate_input
desc 'Generate input files in JSON format.'
task generate_input: :environment do
Expand Down Expand Up @@ -78,9 +48,9 @@ namespace :check do
end
m_ids = pms.map(&:media_id)
Media.where(id: m_ids, type: 'Claim').find_each do |m|
print '.'
uuid[m.id] = get_claim_uuid(m.quote) || m.id.to_s
end
print '.'
uuid[m.id] = m.uuid.to_s
end
end
pm_ids.each do |pm_id|
m_uuid = uuid[pm_media_mapping[pm_id]]
Expand All @@ -103,7 +73,7 @@ namespace :check do
end
Media.where(id: tpm_m_mapping.values, type: 'Claim').find_each do |m|
print '.'
t_uuid[m.id] = get_claim_uuid(m.quote) || m.id.to_s
t_uuid[m.id] = m.uuid.to_s
end
relations.each do |r|
print '.'
Expand Down Expand Up @@ -262,8 +232,8 @@ namespace :check do
uuid[pm.media_id] = pm.media_id.to_s
end
Media.where(id: pms.map(&:media_id), type: 'Claim').find_each do |m|
print '.'
uuid[m.id] = get_claim_uuid(m.quote) || m.id.to_s
print '.'
uuid[m.id] = m.uuid.to_s
end
# Fact-checks
pm_fc_mapping = {} # Project Media ID => Fact-Check Updated At
Expand Down
36 changes: 36 additions & 0 deletions lib/tasks/migrate/20240404154458_add_uuid_to_medias.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
namespace :check do
  namespace :migrate do
    # Backfills medias.uuid for existing rows.
    #
    # Claims get the id of the first Claim media with the same (stripped,
    # lower-cased) quote that is joined to a project_medias row, or their own
    # id when there is none. Every other media type gets its own id.
    # Progress through the claims is checkpointed in Rails.cache after each
    # batch, and a later run starts after the cached id.
    task migrate_media_uuid: :environment do
      start_time = Time.now.to_i
      checkpoint_key = 'check:migrate:migrate_media_uuid:claim_id'

      # Media of type Claim
      resume_after = Rails.cache.read(checkpoint_key) || 0
      Media.where(type: 'Claim').where('id > ?', resume_after).find_in_batches(batch_size: 2000) do |batch|
        rows = batch.map do |media|
          print '.'
          normalized_quote = media.quote.to_s.strip.downcase
          shared = Media.where(type: 'Claim')
                        .where('lower(quote) = ?', normalized_quote)
                        .joins("INNER JOIN project_medias pm ON pm.media_id = medias.id")
                        .first
          media.uuid = shared&.id || media.id
          media.attributes
        end
        Media.upsert_all(rows)
        Rails.cache.write(checkpoint_key, batch.map(&:id).max)
      end

      # Other medias (link, image, audio, etc)
      Media.where.not(type: 'Claim').find_in_batches(batch_size: 2000) do |batch|
        rows = batch.map do |media|
          print '.'
          media.uuid = media.id
          media.attributes
        end
        Media.upsert_all(rows)
      end

      minutes = (Time.now.to_i - start_time) / 60
      puts "[#{Time.now}] Done in #{minutes} minutes."
    end
  end
end
1 change: 1 addition & 0 deletions test/models/media_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,7 @@ def setup
m = create_media
assert_equal m.id, m.uuid
c1 = create_claim_media quote: 'Foo'
assert_equal c1.id, c1.uuid
create_project_media media: c1
assert_equal c1.id, c1.uuid
c2 = create_claim_media quote: 'Foo'
Expand Down

0 comments on commit 216147b

Please sign in to comment.