Skip to content

Commit

Permalink
Fixing URL shortening for Arabic URLs.
Browse files Browse the repository at this point in the history
We have to check whether the URL itself contains Arabic characters, not only whether the full text that contains the URL does.

Fixes: CV2-3814.
  • Loading branch information
caiosba authored Oct 5, 2023
1 parent dfbb1bc commit 5da8afa
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 3 deletions.
2 changes: 1 addition & 1 deletion lib/url_rewriter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def self.utmize(url, source)
def self.shorten_and_utmize_urls(input_text, source = nil, owner = nil)
text = input_text
# Encode URLs in Arabic which are not detected by the URL extraction methods
text = text.gsub(/https?:\/\/[\S]+/) { |url| Addressable::URI.escape(url) } if input_text =~ /\p{Arabic}/
text = text.gsub(/https?:\/\/[\S]+/) { |url| url =~ /\p{Arabic}/ ? Addressable::URI.escape(url) : url } if input_text =~ /\p{Arabic}/
entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(text, extract_url_without_protocol: true)
# Ruby 2.7 freezes the empty string from nil.to_s, which causes an error within the rewriter
Twitter::TwitterText::Rewriter.rewrite_entities(text || '', entities) do |entity, _codepoints|
Expand Down
14 changes: 12 additions & 2 deletions test/lib/url_rewriter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,19 @@ def teardown
test 'should shorten Arabic URL' do
shortened = nil
stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do
shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/ for more information.', nil)
shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-قديم،-ولا-يبين-لحظة-إنقاذ-شا/ for more information.', nil)
end
assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%84%D9%8A%D8%B3-%D9%84%D8%A7%D8%B4%D8%AA%D8%A8%D8%A7%D9%83%D8%A7%D8%AA-%D8%AD%D8%AF%D9%8A%D8%AB%D8%A9-%D8%A8%D9%8A%D9%86-%D8%A7%D9%84%D8%AC/', Shortener::ShortenedUrl.last.url
assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%82%D8%AF%D9%8A%D9%85%D8%8C-%D9%88%D9%84%D8%A7-%D9%8A%D8%A8%D9%8A%D9%86-%D9%84%D8%AD%D8%B8%D8%A9-%D8%A5%D9%86%D9%82%D8%A7%D8%B0-%D8%B4%D8%A7/', Shortener::ShortenedUrl.last.url
assert_match /^Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened
end

test 'should not shorten decoded Arabic URL' do
url = 'https://fatabyyano.net/%da%af%d8%b1%d8%aa%db%95-%da%a4%db%8c%d8%af%db%8c%db%86%db%8c%db%8c%db%95%da%a9%db%95-%d8%b3%d8%a7%d8%ae%d8%aa%db%95%db%8c%db%95-%d9%88-%d9%84%d8%a7%d9%81%d8%a7%d9%88-%d9%88-%d8%b2%d8%b1%db%8c%d8%a7/'
shortened = nil
stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do
shortened = UrlRewriter.shorten_and_utmize_urls("فتبينوا | Visit #{url} for more information.", nil)
end
assert_equal url, Shortener::ShortenedUrl.last.url
assert_match /^فتبينوا \| Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened
end
end

0 comments on commit 5da8afa

Please sign in to comment.