whitequark · avit · Nov 14, 2020 · Nov 14, 2020 · Nov 14, 2020
diff --git a/lib/rack/utf8_sanitizer.rb b/lib/rack/utf8_sanitizer.rb
@@ -205,25 +205,37 @@ def decode_string(input)
           force_encoding(Encoding::ASCII_8BIT))
     end
 
-    # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
-    # plus all multibyte UTF-8 characters.
-    UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
+    # All 'unreserved' characters from RFC3986 (2.3):
+    #
+    #   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    #
+    # Stored as octet values and octet ranges, so every element can be compared
+    # against an Integer octet with `===` (e.g. when splatted into a `when`).
+    # '~' is listed explicitly: the regexp this list replaces included it, and
+    # without it "%7E" would no longer be decoded.
+    UNRESERVED = [
+      '-'.ord,
+      '.'.ord,
+      '_'.ord,
+      '~'.ord,
+      ('A'.ord)..('Z'.ord),
+      ('a'.ord)..('z'.ord),
+      ('0'.ord)..('9'.ord)
+    ].map(&:freeze).freeze
+
+    # All octets with the high bit set (0x80..0xFF), i.e. the octets that make
+    # up multibyte UTF-8 sequences.
+    MULTIBYTE = (0x80..0xFF).freeze
 
     # RFC3986, 2.2 states that the characters from 'reserved' group must be
     # protected during normalization (which is what UTF8Sanitizer does).
     #
     # However, the regexp approach used by URI.unescape is not sophisticated
     # enough for our task.
     def unescape_unreserved(input)
-      input.gsub(/%([a-f\d]{2})/i) do |encoded|
-        decoded = $1.hex.chr
-
-        if decoded =~ UNRESERVED_OR_UTF8
-          decoded
-        else
-          encoded
-        end
-      end
+      # Memoized table mapping a percent-escape ("%XX") to its replacement.
+      # Entries are computed by the block the first time an escape is looked up
+      # and stored in the table for later lookups.
+      @percent_decoded_mapping ||= Hash.new do |cache, escape|
+        byte = escape[1, 2].hex
+        decodable =
+          case byte
+          when *UNRESERVED, MULTIBYTE then true
+          else false
+          end
+
+        # Unreserved and high-bit octets are decoded to the raw byte; any other
+        # escape is kept exactly as it appeared in the input.
+        cache[escape] = decodable ? byte.chr : escape
+      end
+
+      # Every "%XX" match is replaced by its table entry.
+      input.gsub(/%\h\h/, @percent_decoded_mapping)
     end
 
     # This regexp matches unsafe characters, i.e. everything except 'reserved'
@@ -234,10 +246,15 @@ def unescape_unreserved(input)
     # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
     UNSAFE           = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
 
-    # Performs the reverse function of `unescape_unreserved`. Unlike
-    # the previous function, we can reuse the logic in URI#encode
+    # Performs the reverse function of `unescape_unreserved`. The logic here is
+    # optimized from URI::RFC2396_Parser#escape
     def escape_unreserved(input)
-      URI::DEFAULT_PARSER.escape(input, UNSAFE)
+      # Memoized table mapping an unsafe character to its percent-encoded form.
+      # Each byte of the character becomes "%XX" with exactly two uppercase hex
+      # digits, matching what URI::RFC2396_Parser#escape emits; a bare
+      # `to_s(16)` would turn e.g. a tab into the malformed escape "%9".
+      #
+      # NOTE(review): the table is keyed by the matched character and is never
+      # pruned, so it grows with the number of distinct unsafe characters seen.
+      # Confirm this is acceptable for untrusted input.
+      @unsafe_map ||= Hash.new do |table, us|
+        table[us] = us.each_byte.map { |uc| format('%%%02X', uc) }.join
+      end
+
+      # The result contains only 7-bit characters, so tag it as US-ASCII.
+      input.gsub(UNSAFE, @unsafe_map).force_encoding(Encoding::US_ASCII)
     end
 
     def sanitize_string(input)

diff --git a/test/bench_utf8_sanitizer.rb b/test/bench_utf8_sanitizer.rb
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+require 'minitest/autorun'
+require 'minitest/benchmark'
+
+require_relative '../lib/rack/utf8_sanitizer'
+
+# Benchmarks Rack::UTF8Sanitizer on form-urlencoded request bodies of
+# increasing size and asserts that the measured times fit a linear model.
+class RackUTF8SanitizerBenchmark < Minitest::Benchmark
+  # Input sizes passed as `n` to the benchmark block, as generated by
+  # bench_exp(10, 1_000_000, 10).
+  def self.bench_range
+    bench_exp(10, 1_000_000, 10)
+  end
+
+  # Returns a random octet (0x00..0xFF) as exactly two lowercase hex digits,
+  # so that "%#{hex}" is always a well-formed percent-escape.
+  def hex
+    format('%02x', rand(256))
+  end
+
+  # Builds a string of `size` three-character chunks. Each chunk is either a
+  # random percent-escape or the literal '___'.
+  #
+  # encode_ratio: probability (0.0..1.0) that a chunk is a percent-escape.
+  def data(size, encode_ratio: 1.0)
+    buffer = String.new
+    size.times do
+      encoded = rand + encode_ratio >= 1.0
+      buffer << (encoded ? "%#{hex}" : '___')
+    end
+    buffer
+  end
+
+  # Generates the shared corpus: 10_000_000 chunks, roughly 20% of them
+  # percent-escapes.
+  def setup
+    @data = data(10_000_000, encode_ratio: 0.2)
+  end
+
+  # Feeds random n-character slices of the corpus through the middleware as
+  # the body of a form-urlencoded POST.
+  def bench_urlencoded_input
+    app = Rack::UTF8Sanitizer.new(->(env) { env })
+
+    request_env = {
+      'REQUEST_METHOD' => 'POST',
+      'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
+    }
+
+    assert_performance_linear 0.99 do |n|
+      20.times do
+        # The start offset is drawn from the first third of the corpus; a slice
+        # may therefore begin or end in the middle of an escape.
+        offset = rand((@data.size / 3) - n)
+        data = @data.slice(offset, n)
+        app.call(request_env.merge('rack.input' => StringIO.new(data)))
+      end
+    end
+  end
+end