Skip to content

Commit

Permalink
Increase test coverage and coerce inputs to string
Browse files Browse the repository at this point in the history
Where possible, coerce inputs to strings with StringValue rather than
raising a TypeError. This particularly impacts the relatively recent
RE2::Set API which was excessively strict about its arguments.

Add test coverage to all parts of the API, better covering edge cases
including how encoding is handled based on the encoding of the RE2
pattern.
  • Loading branch information
mudge committed Sep 15, 2023
1 parent 7e499c2 commit 5bde328
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 53 deletions.
28 changes: 14 additions & 14 deletions ext/re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,6 @@ static VALUE re2_scanner_scan(VALUE self) {
original_input_size = c->input->size();

for (i = 0; i < c->number_of_capturing_groups; i++) {
matches[i] = "";
argv[i] = &matches[i];
args[i] = &argv[i];
}
Expand Down Expand Up @@ -1404,24 +1403,23 @@ static VALUE re2_Replace(VALUE self, VALUE str, VALUE pattern,
UNUSED(self);
re2_pattern *p;

/* Convert all the inputs to be pumped into RE2::Replace. */
/* Take a copy of str so it can be modified in-place by
* RE2::Replace.
*/
string str_as_string(StringValuePtr(str));

/* Do the replacement. */
if (rb_obj_is_kind_of(pattern, re2_cRegexp)) {
Data_Get_Struct(pattern, re2_pattern, p);
RE2::Replace(&str_as_string, *p->pattern, StringValuePtr(rewrite));

return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(), p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
} else {
RE2::Replace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(), pattern);
}

}

/*
Expand All @@ -1440,7 +1438,9 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
VALUE rewrite) {
UNUSED(self);

/* Convert all the inputs to be pumped into RE2::GlobalReplace. */
/* Take a copy of str so it can be modified in-place by
* RE2::GlobalReplace.
*/
re2_pattern *p;
string str_as_string(StringValuePtr(str));

Expand All @@ -1449,14 +1449,12 @@ static VALUE re2_GlobalReplace(VALUE self, VALUE str, VALUE pattern,
Data_Get_Struct(pattern, re2_pattern, p);
RE2::GlobalReplace(&str_as_string, *p->pattern, StringValuePtr(rewrite));

return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(),
p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
return ENCODED_STR_NEW(str_as_string.data(), str_as_string.size(), p->pattern->options().encoding() == RE2::Options::EncodingUTF8 ? "UTF-8" : "ISO-8859-1");
} else {
RE2::GlobalReplace(&str_as_string, StringValuePtr(pattern),
StringValuePtr(rewrite));

return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(),
pattern);
return ENCODED_STR_NEW2(str_as_string.data(), str_as_string.size(), pattern);
}
}

Expand Down Expand Up @@ -1579,11 +1577,12 @@ static VALUE re2_set_initialize(int argc, VALUE *argv, VALUE self) {
* set.add("def") #=> 1
*/
static VALUE re2_set_add(VALUE self, VALUE pattern) {
Check_Type(pattern, T_STRING);
StringValue(pattern);
re2::StringPiece regex(RSTRING_PTR(pattern), RSTRING_LEN(pattern));
std::string err;
re2_set *s;
Data_Get_Struct(self, re2_set, s);

int index = s->set->Add(regex, &err);
if (index < 0) {
rb_raise(rb_eArgError, "str rejected by RE2::Set->Add(): %s", err.c_str());
Expand Down Expand Up @@ -1669,7 +1668,8 @@ static VALUE re2_set_match(int argc, VALUE *argv, VALUE self) {
VALUE str, options, exception_option;
bool raise_exception = true;
rb_scan_args(argc, argv, "11", &str, &options);
Check_Type(str, T_STRING);

StringValue(str);
re2::StringPiece data(RSTRING_PTR(str), RSTRING_LEN(str));
std::vector<int> v;
re2_set *s;
Expand Down
6 changes: 3 additions & 3 deletions spec/kernel_spec.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
RSpec.describe Kernel do
describe "#RE2" do
describe ".RE2" do
it "returns an RE2::Regexp instance given a pattern" do
expect(RE2('w(o)(o)')).to be_a(RE2::Regexp)
end

it "returns an RE2::Regexp instance given a pattern and options" do
re = RE2('w(o)(o)', :case_sensitive => false)
expect(re).to be_a(RE2::Regexp)
expect(re).to_not be_case_sensitive

expect(re).not_to be_case_sensitive
end
end
end
82 changes: 60 additions & 22 deletions spec/re2/scanner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
end

describe "#scan" do
it "returns the next array of matches" do
it "returns the next array of matches", :aggregate_failures do
r = RE2::Regexp.new('(\w+)')
scanner = r.scan("It is a truth universally acknowledged")

expect(scanner.scan).to eq(["It"])
expect(scanner.scan).to eq(["is"])
expect(scanner.scan).to eq(["a"])
Expand All @@ -33,92 +34,123 @@
expect(scanner.scan).to be_nil
end

it "returns multiple capturing groups at a time", :aggregate_failures do
r = RE2::Regexp.new('(\w+) (\w+)')
scanner = r.scan("It is a truth universally acknowledged")

expect(scanner.scan).to eq(["It", "is"])
expect(scanner.scan).to eq(["a", "truth"])
expect(scanner.scan).to eq(["universally", "acknowledged"])
expect(scanner.scan).to be_nil
end

it "returns an empty array if there are no capturing groups" do
r = RE2::Regexp.new('\w+')
scanner = r.scan("Foo bar")

expect(scanner.scan).to eq([])
end

it "returns nil if there is no match" do
r = RE2::Regexp.new('\d+')
scanner = r.scan("Foo bar")

expect(scanner.scan).to be_nil
end

it "returns nil if the regexp is invalid" do
r = RE2::Regexp.new('???', :log_errors => false)
scanner = r.scan("Foo bar")

expect(scanner.scan).to be_nil
end

it "returns an empty array if the input is empty" do
it "returns an empty array if the input is empty", :aggregate_failures do
r = RE2::Regexp.new("")
scanner = r.scan("")

expect(scanner.scan).to eq([])
expect(scanner.scan).to be_nil
end

it "returns an array of nil with an empty input and capture" do
it "returns an array of nil with an empty input and capture", :aggregate_failures do
r = RE2::Regexp.new("()")
scanner = r.scan("")

expect(scanner.scan).to eq([nil])
expect(scanner.scan).to be_nil
end

it "returns an empty array for every match if the pattern is empty" do
it "returns an empty array for every match if the pattern is empty", :aggregate_failures do
r = RE2::Regexp.new("")
scanner = r.scan("Foo")

expect(scanner.scan).to eq([])
expect(scanner.scan).to eq([])
expect(scanner.scan).to eq([])
expect(scanner.scan).to eq([])
expect(scanner.scan).to be_nil
end

it "returns an array of nil if the pattern is an empty capturing group" do
it "returns an array of nil if the pattern is an empty capturing group", :aggregate_failures do
r = RE2::Regexp.new("()")
scanner = r.scan("Foo")

expect(scanner.scan).to eq([nil])
expect(scanner.scan).to eq([nil])
expect(scanner.scan).to eq([nil])
expect(scanner.scan).to eq([nil])
expect(scanner.scan).to be_nil
end

it "returns array of nils with multiple empty capturing groups" do
it "returns array of nils with multiple empty capturing groups", :aggregate_failures do
r = RE2::Regexp.new("()()()")
scanner = r.scan("Foo")

expect(scanner.scan).to eq([nil, nil, nil])
expect(scanner.scan).to eq([nil, nil, nil])
expect(scanner.scan).to eq([nil, nil, nil])
expect(scanner.scan).to eq([nil, nil, nil])
expect(scanner.scan).to be_nil
end

it "supports empty groups with multibyte characters" do
it "supports empty groups with multibyte characters", :aggregate_failures do
r = RE2::Regexp.new("()€")
scanner = r.scan("€")

expect(scanner.scan).to eq([nil])
expect(scanner.scan).to be_nil
end

it "raises a type error if given input that can't be coerced to a String" do
r = RE2::Regexp.new('(\w+)')

expect { r.scan(0) }.to raise_error(TypeError)
end

it "accepts input that can be coerced to a String", :aggregate_failures do
r = RE2::Regexp.new('(\w+)')
scanner = r.scan(StringLike.new("Hello world"))

expect(scanner.scan).to eq(["Hello"])
expect(scanner.scan).to eq(["world"])
expect(scanner.scan).to be_nil
end
end

it "is enumerable" do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("There are 1 some 2 numbers 3")

expect(scanner).to be_a(Enumerable)
end

describe "#each" do
it "yields each match" do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("There are 1 some 2 numbers 3")
matches = []
scanner.each do |match|
matches << match
end

expect(matches).to eq([["1"], ["2"], ["3"]])
expect { |b| scanner.each(&b) }.to yield_successive_args(["1"], ["2"], ["3"])
end

it "returns an enumerator when not given a block" do
Expand All @@ -135,22 +167,28 @@
end

describe "#rewind" do
it "resets any consumption" do
it "resets any consumption", :aggregate_failures do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("There are 1 some 2 numbers 3")

expect(scanner.to_enum.first).to eq(["1"])
expect(scanner.to_enum.first).to eq(["2"])

scanner.rewind

expect(scanner.to_enum.first).to eq(["1"])
end

it "resets the eof? check" do
it "resets the eof? check", :aggregate_failures do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("1")
scanner.scan
expect(scanner.eof?).to be_truthy

expect(scanner).to be_eof

scanner.rewind
expect(scanner.eof?).to be_falsey

expect(scanner).not_to be_eof
end
end

Expand All @@ -159,46 +197,46 @@
r = RE2::Regexp.new('(\d)')
scanner = r.scan("1 2 3")

expect(scanner.eof?).to be_falsey
expect(scanner).not_to be_eof
end

it "returns true if the input has been consumed" do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("1")
scanner.scan

expect(scanner.eof?).to be_truthy
expect(scanner).to be_eof
end

it "returns false if no match is made" do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("a")
scanner.scan

expect(scanner.eof?).to be_falsey
expect(scanner).not_to be_eof
end

it "returns false with an empty input that has not been scanned" do
r = RE2::Regexp.new("")
scanner = r.scan("")

expect(scanner.eof?).to be_falsey
expect(scanner).not_to be_eof
end

it "returns false with an empty input that has not been matched" do
r = RE2::Regexp.new('(\d)')
scanner = r.scan("")
scanner.scan

expect(scanner.eof?).to be_falsey
expect(scanner).not_to be_eof
end

it "returns true with an empty input that has been matched" do
r = RE2::Regexp.new("")
scanner = r.scan("")
scanner.scan

expect(scanner.eof?).to be_truthy
expect(scanner).to be_eof
end
end
end
Loading

0 comments on commit 5bde328

Please sign in to comment.