Replace per-match regexp percent-(un)escaping in Rack::UTF8Sanitizer with
memoized lookup tables, and add a Minitest benchmark asserting that
sanitizing a form-encoded body scales linearly with its size.

diff --git a/lib/rack/utf8_sanitizer.rb b/lib/rack/utf8_sanitizer.rb
--- a/lib/rack/utf8_sanitizer.rb
+++ b/lib/rack/utf8_sanitizer.rb
@@ -205,9 +205,19 @@ def decode_string(input)
           force_encoding(Encoding::ASCII_8BIT))
     end
 
-    # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
-    # plus all multibyte UTF-8 characters.
-    UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
+    # All 'unreserved' characters from RFC3986 (2.3), as octets or octet ranges
+    UNRESERVED = [
+      '-'.ord,
+      '.'.ord,
+      '_'.ord,
+      '~'.ord,
+      ('A'.ord)..('Z'.ord),
+      ('a'.ord)..('z'.ord),
+      ('0'.ord)..('9'.ord)
+    ].map(&:freeze).freeze
+
+    # All octets that can occur inside a multibyte UTF-8 sequence
+    MULTIBYTE = (0x80..0xFF).freeze
 
     # RFC3986, 2.2 states that the characters from 'reserved' group must be
     # protected during normalization (which is what UTF8Sanitizer does).
@@ -215,15 +225,19 @@ def decode_string(input)
     # However, the regexp approach used by URI.unescape is not sophisticated
     # enough for our task.
     def unescape_unreserved(input)
-      input.gsub(/%([a-f\d]{2})/i) do |encoded|
-        decoded = $1.hex.chr
+      # Memo table: each distinct `%XX` escape is classified only once.
+      @percent_decoded_mapping ||= Hash.new do |table, encoded|
+        octet = encoded.slice(1, 2).hex
 
-        if decoded =~ UNRESERVED_OR_UTF8
-          decoded
+        case octet
+        when *UNRESERVED, MULTIBYTE
+          table[encoded] = octet.chr
         else
-          encoded
+          table[encoded] = encoded
         end
       end
+
+      input.gsub(/%\h\h/, @percent_decoded_mapping)
     end
 
     # This regexp matches unsafe characters, i.e. everything except 'reserved'
@@ -234,10 +248,18 @@ def unescape_unreserved(input)
     # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
     UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
 
-    # Performs the reverse function of `unescape_unreserved`. Unlike
-    # the previous function, we can reuse the logic in URI#encode
+    # Performs the reverse function of `unescape_unreserved`. The logic here is
+    # optimized from URI::RFC2396_Parser#escape
     def escape_unreserved(input)
-      URI::DEFAULT_PARSER.escape(input, UNSAFE)
+      @unsafe_map ||= Hash.new do |table, us|
+        encoded = us.each_byte.reduce('') do |tmp, uc|
+          tmp << sprintf('%%%02X', uc)
+        end
+        # Memoize matches of at most 3 bytes; longer ones are recomputed.
+        table[us] = encoded if us.bytesize <= 3
+        encoded
+      end
+      input.gsub(UNSAFE, @unsafe_map).force_encoding(Encoding::US_ASCII)
     end
 
     def sanitize_string(input)
diff --git a/test/bench_utf8_sanitizer.rb b/test/bench_utf8_sanitizer.rb
new file mode 100644
--- /dev/null
+++ b/test/bench_utf8_sanitizer.rb
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+require 'minitest/autorun'
+require 'minitest/benchmark'
+
+require_relative '../lib/rack/utf8_sanitizer'
+
+# Asserts that sanitizing a form-encoded body scales linearly with its size.
+class RackUTF8SanitizerBenchmark < Minitest::Benchmark
+  def self.bench_range
+    bench_exp(10, 1_000_000, 10)
+  end
+
+  # Returns a random octet as exactly two hex digits, so that "%#{hex}" is
+  # always a well-formed percent-escape covering the full 0x00..0xFF range.
+  def hex
+    format('%02x', rand(256))
+  end
+
+  # Builds a string of `size` three-character tokens; each token is a
+  # percent-escape with probability `encode_ratio`, otherwise '___'.
+  def data(size, encode_ratio: 1.0)
+    size.times.each_with_object(String.new) do |_, str|
+      str << (rand < encode_ratio ? "%#{hex}" : '___')
+    end
+  end
+
+  def setup
+    @data = data(10_000_000, encode_ratio: 0.2)
+  end
+
+  def bench_urlencoded_input
+    app = Rack::UTF8Sanitizer.new(->(env) { env })
+
+    request_env = {
+      'REQUEST_METHOD' => 'POST',
+      'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
+    }
+
+    assert_performance_linear 0.99 do |n|
+      20.times do
+        offset = rand((@data.size / 3) - n)
+        data = @data.slice(offset, n)
+        app.call(request_env.merge('rack.input' => StringIO.new(data)))
+      end
+    end
+  end
+end