diff --git a/lib/url_rewriter.rb b/lib/url_rewriter.rb
index 6f1ad65040..e97fff262b 100644
--- a/lib/url_rewriter.rb
+++ b/lib/url_rewriter.rb
@@ -20,7 +20,10 @@ def self.utmize(url, source)
     end
   end
 
-  def self.shorten_and_utmize_urls(text, source = nil, owner = nil)
+  def self.shorten_and_utmize_urls(input_text, source = nil, owner = nil)
+    text = input_text
+    # Encode URLs in Arabic which are not detected by the URL extraction methods
+    text = text.gsub(/https?:\/\/[\S]+/) { |url| Addressable::URI.escape(url) } if input_text =~ /\p{Arabic}/
     entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(text, extract_url_without_protocol: true)
     # Ruby 2.7 freezes the empty string from nil.to_s, which causes an error within the rewriter
     Twitter::TwitterText::Rewriter.rewrite_entities(text || '', entities) do |entity, _codepoints|
diff --git a/test/lib/url_rewriter_test.rb b/test/lib/url_rewriter_test.rb
index 441d79f67f..4485cedf2c 100644
--- a/test/lib/url_rewriter_test.rb
+++ b/test/lib/url_rewriter_test.rb
@@ -50,4 +50,13 @@ def teardown
       assert_equal url, UrlRewriter.shorten(url, nil)
     end
   end
+
+  test 'should shorten Arabic URL' do
+    shortened = nil
+    stub_configs({ 'short_url_host_display' => 'https://chck.media' }) do
+      shortened = UrlRewriter.shorten_and_utmize_urls('Visit https://fatabyyano.net/هذا-المقطع-ليس-لاشتباكات-حديثة-بين-الج/ for more information.', nil)
+    end
+    assert_equal 'https://fatabyyano.net/%D9%87%D8%B0%D8%A7-%D8%A7%D9%84%D9%85%D9%82%D8%B7%D8%B9-%D9%84%D9%8A%D8%B3-%D9%84%D8%A7%D8%B4%D8%AA%D8%A8%D8%A7%D9%83%D8%A7%D8%AA-%D8%AD%D8%AF%D9%8A%D8%AB%D8%A9-%D8%A8%D9%8A%D9%86-%D8%A7%D9%84%D8%AC/', Shortener::ShortenedUrl.last.url
+    assert_match /^Visit https:\/\/chck\.media\/[a-zA-Z0-9]+ for more information\.$/, shortened
+  end
 end