From 5e8e55de0ff9a188b14494ecb0a5e94a349dff4c Mon Sep 17 00:00:00 2001 From: Caio <117518+caiosba@users.noreply.github.com> Date: Tue, 12 Sep 2023 22:59:57 -0300 Subject: [PATCH] Fixing URL shortening for URLs with Arabic characters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When URLs have unescaped Arabic characters, they are not extracted correctly. This happens not only with the library we use (`twitter-text`), but also with Ruby's `uri` library and `postrank-uri` gem: ``` irb(main):009:0> PostRank::URI.extract('https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/') => ["https://fatabyyano.net/"] irb(main):010:0> URI.extract('https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/') => ["https://fatabyyano.net/"] irb(main):011:0> Twitter::TwitterText::Extractor.extract_urls('https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/') => ["https://fatabyyano.net/"] ``` So, the fix here is to first escape URLs that contain Arabic characters before sending them to the URL extraction method when shortening URLs. Fixes CV2-3690. --- lib/url_rewriter.rb | 5 ++++- test/lib/url_rewriter_test.rb | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/url_rewriter.rb b/lib/url_rewriter.rb index 6f1ad65040..e97fff262b 100644 --- a/lib/url_rewriter.rb +++ b/lib/url_rewriter.rb @@ -20,7 +20,10 @@ def self.utmize(url, source) end end - def self.shorten_and_utmize_urls(text, source = nil, owner = nil) + def self.shorten_and_utmize_urls(input_text, source = nil, owner = nil) + text = input_text + # Encode URLs in Arabic which are not detected by the URL extraction methods + text = text.gsub(/https?:\/\/[\S]+/) { |url| Addressable::URI.escape(url) } if input_text =~ /\p{Arabic}/ entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(text, extract_url_without_protocol: true) # Ruby 2.7 freezes the empty string from nil.to_s, which causes an error within the rewriter Twitter::TwitterText::Rewriter.rewrite_entities(text || '', entities) do |entity, _codepoints| diff --git a/test/lib/url_rewriter_test.rb b/test/lib/url_rewriter_test.rb index 441d79f67f..4485cedf2c 100644 --- a/test/lib/url_rewriter_test.rb +++ b/test/lib/url_rewriter_test.rb @@ -50,4 +50,13 @@ def teardown assert_equal url, UrlRewriter.shorten(url, nil) end end + + test 'should shorten Arabic URL' do + shortened = nil + stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do + shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/ for more information.', nil) + end + assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%84%D9%8A%D8%B3-%D9%84%D8%A7%D8%B4%D8%AA%D8%A8%D8%A7%D9%83%D8%A7%D8%AA-%D8%AD%D8%AF%D9%8A%D8%AB%D8%A9-%D8%A8%D9%8A%D9%86-%D8%A7%D9%84%D8%AC/', Shortener::ShortenedUrl.last.url + assert_match /^Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened + end end