From 0a60eaff7afab7e320762a870d6cb62e96e521e2 Mon Sep 17 00:00:00 2001
From: Caio Almeida <117518+caiosba@users.noreply.github.com>
Date: Thu, 5 Oct 2023 16:47:34 -0300
Subject: [PATCH] Fixing URL shortening for Arabic URLs.

We have to check whether the URL itself contains Arabic characters, not
only the full text that contains the URL.

Fixes: CV2-3814.
---
 lib/url_rewriter.rb           |  2 +-
 test/lib/url_rewriter_test.rb | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lib/url_rewriter.rb b/lib/url_rewriter.rb
index e97fff262b..8955a21be0 100644
--- a/lib/url_rewriter.rb
+++ b/lib/url_rewriter.rb
@@ -23,7 +23,7 @@ def self.utmize(url, source)
   def self.shorten_and_utmize_urls(input_text, source = nil, owner = nil)
     text = input_text
     # Encode URLs in Arabic which are not detected by the URL extraction methods
-    text = text.gsub(/https?:\/\/[\S]+/) { |url| Addressable::URI.escape(url) } if input_text =~ /\p{Arabic}/
+    text = text.gsub(/https?:\/\/[\S]+/) { |url| url =~ /\p{Arabic}/ ? Addressable::URI.escape(url) : url } if input_text =~ /\p{Arabic}/
     entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(text, extract_url_without_protocol: true)
     # Ruby 2.7 freezes the empty string from nil.to_s, which causes an error within the rewriter
     Twitter::TwitterText::Rewriter.rewrite_entities(text || '', entities) do |entity, _codepoints|
diff --git a/test/lib/url_rewriter_test.rb b/test/lib/url_rewriter_test.rb
index 4485cedf2c..4c83641f06 100644
--- a/test/lib/url_rewriter_test.rb
+++ b/test/lib/url_rewriter_test.rb
@@ -54,9 +54,19 @@ def teardown
   test 'should shorten Arabic URL' do
     shortened = nil
     stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do
-      shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/ for more information.', nil)
+      shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-قديم،-ولا-يبين-لحظة-إنقاذ-شا/ for more information.', nil)
     end
-    assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%84%D9%8A%D8%B3-%D9%84%D8%A7%D8%B4%D8%AA%D8%A8%D8%A7%D9%83%D8%A7%D8%AA-%D8%AD%D8%AF%D9%8A%D8%AB%D8%A9-%D8%A8%D9%8A%D9%86-%D8%A7%D9%84%D8%AC/', Shortener::ShortenedUrl.last.url
+    assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%82%D8%AF%D9%8A%D9%85%D8%8C-%D9%88%D9%84%D8%A7-%D9%8A%D8%A8%D9%8A%D9%86-%D9%84%D8%AD%D8%B8%D8%A9-%D8%A5%D9%86%D9%82%D8%A7%D8%B0-%D8%B4%D8%A7/', Shortener::ShortenedUrl.last.url
     assert_match /^Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened
   end
+
+  test 'should not re-encode already percent-encoded URL when text contains Arabic' do
+    url = 'https://fatabyyano.net/%da%af%d8%b1%d8%aa%db%95-%da%a4%db%8c%d8%af%db%8c%db%86%db%8c%db%8c%db%95%da%a9%db%95-%d8%b3%d8%a7%d8%ae%d8%aa%db%95%db%8c%db%95-%d9%88-%d9%84%d8%a7%d9%81%d8%a7%d9%88-%d9%88-%d8%b2%d8%b1%db%8c%d8%a7/'
+    shortened = nil
+    stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do
+      shortened = UrlRewriter.shorten_and_utmize_urls("فتبينوا | Visit #{url} for more information.", nil)
+    end
+    assert_equal url, Shortener::ShortenedUrl.last.url
+    assert_match /^فتبينوا \| Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened
+  end
 end