Skip to content

Commit

Permalink
AVRO-1521 [Perl] Fix boolean encoding errors
Browse files Browse the repository at this point in the history
This change fixes a long-standing issue with the binary encoding
of boolean values. In particular, that while several "smart" values
were accepted as valid boolean values by Avro::Schema (eg. "true"
and "no"), Avro::BinaryEncoder encoded them as true or false depending
on their truth value for Perl. This resulted in both of those examples
being encoded as true, because for Perl any non-empty string is true.

This change makes it so that those values are accepted and properly
handled, and handles other values that represent boolean values
like JSON::PP::Boolean references and native Perl booleans (those
that would be returned by eg. builtin::true).

This also includes a small but possibly breaking bugfix for the
detection of valid boolean values in Avro::Schema, which was using
a non-anchored regular expression to filter values, meaning that
eg. any value that had an "n" anywhere would be considered valid.
This was most likely an involuntary error, so while breaking, it
feels like we have to fix it.
  • Loading branch information
jjatria committed Jun 28, 2024
1 parent 677e982 commit 56e0655
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 26 deletions.
12 changes: 12 additions & 0 deletions lang/perl/Changes
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ Revision history for Perl extension Avro
are encoded in fields of type 'byte' or 'fixed'.
Errors resulting from this process will be raised as
Avro::BinaryEncoder::Error exceptions
- Fixed a bug with the detection of valid boolean values
which was not using anchors in a regular expression.
Valid values are (case insensitively) 'yes', 'y', 'no',
'n', 'true', 't', 'false', 'f', 0, 1, and anything that
is accepted by JSON::PP::is_bool. References rejected by
this last function are not valid
- Fixed some issues around the binary encoding of boolean
values. In particular, Avro::BinaryEncoder now correctly
encodes all the values that Avro::Schema accepts as
valid boolean values, which before were blindly evaluated
in boolean context resulting in strings like 'false'
being serialised as if they were true

1.00 Fri Jan 17 15:00:00 2014
- Relicense under apache license 2.0
Expand Down
23 changes: 22 additions & 1 deletion lang/perl/lib/Avro/BinaryEncoder.pm
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,28 @@ sub encode_null {
sub encode_boolean {
my $class = shift;
my ($schema, $data, $cb) = @_;
$cb->( $data ? \"\x1" : \"\x0" );

throw Avro::BinaryEncoder::Error( "<UNDEF> is not a valid boolean value")
unless defined $data;

if ( my $type = ref $data ) {
throw Avro::BinaryEncoder::Error("cannot encode a '$type' reference as boolean")
unless $type eq 'JSON::PP::Boolean';
}
else {
throw Avro::BinaryEncoder::Error( "'$data' is not a valid boolean value")
unless $data eq '' # For Perl versions without builtin::is_bool
|| JSON::PP::is_bool($data)
|| $data =~ /^(?:true|t|false|f|yes|y|no|n|0|1)$/i;
}

# Some values might be false but evaluate to "truthy" for Perl,
# like the string 'false'. For these known exceptions, which
# would otherwise be false, we read a false value. For everything
# else, we rely on their value evaluated in a boolean context.
my $true = $data =~ /^(?:no|n|false|f)$/i ? 0 : !!$data;

$cb->( $true ? \"\x1" : \"\x0" );
}

sub encode_int {
Expand Down
4 changes: 2 additions & 2 deletions lang/perl/lib/Avro/Schema.pm
Original file line number Diff line number Diff line change
Expand Up @@ -321,9 +321,9 @@ sub is_data_valid {
return 1 unless ref $data;
}
if ($type eq 'boolean') {
return 1 if JSON::PP::is_bool($data);
return 0 if ref $data; # sometimes risky
return 1 if $data =~ m{yes|no|y|n|t|f|true|false}i;
return 0;
return $data =~ m{^(?:yes|no|y|n|t|f|true|false|0|1)$}i;
}
return 0;
}
Expand Down
91 changes: 68 additions & 23 deletions lang/perl/t/02_bin_encode.t
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,26 @@ use Config;
use Test::More;
use Test::Exception;
use Math::BigInt;
use JSON::PP; # For booleans

use_ok 'Avro::BinaryEncoder';

sub primitive_ok {
my ($primitive_type, $primitive_val, $expected_enc) = @_;
sub primitive {
my ($type, $data) = @_;

my $data;
my $meth = "encode_$primitive_type";
Avro::BinaryEncoder->$meth(
undef, $primitive_val, sub { $data = ${$_[0]} }
my $encoded;
my $method = "encode_$type";
Avro::BinaryEncoder->$method(
undef, $data, sub { $encoded = ${$_[0]} }
);
is $data, $expected_enc, "primitive $primitive_type encoded correctly";
return $encoded;
}

sub primitive_ok {
my ($type, $have, $want) = @_;
my $data = primitive( $type => $have );
is primitive( $type => $have ), $want,
"primitive $type encoded @{[ $have // '<UNDEF>' ]} correctly";
return $data;
}

Expand All @@ -44,8 +52,48 @@ sub primitive_ok {
primitive_ok null => undef, '';
primitive_ok null => 'whatev', '';

primitive_ok boolean => 0, "\x0";
primitive_ok boolean => 1, "\x1";
primitive_ok boolean => 0, "\x0";
primitive_ok boolean => 1, "\x1";
primitive_ok boolean => 'false', "\x0";
primitive_ok boolean => 'true', "\x1";
primitive_ok boolean => 'f', "\x0";
primitive_ok boolean => 't', "\x1";
primitive_ok boolean => 'no', "\x0";
primitive_ok boolean => 'yes', "\x1";
primitive_ok boolean => 'n', "\x0";
primitive_ok boolean => 'y', "\x1";
primitive_ok boolean => 'FALSE', "\x0";
primitive_ok boolean => 'TRUE', "\x1";
primitive_ok boolean => 'F', "\x0";
primitive_ok boolean => 'T', "\x1";
primitive_ok boolean => 'NO', "\x0";
primitive_ok boolean => 'YES', "\x1";
primitive_ok boolean => 'N', "\x0";
primitive_ok boolean => 'Y', "\x1";
primitive_ok boolean => !!0, "\x0"; # Native false
primitive_ok boolean => !!1, "\x1"; # Native true
primitive_ok boolean => $JSON::PP::false, "\x0";
primitive_ok boolean => $JSON::PP::true, "\x1";

throws_ok {
primitive boolean => undef;
} "Avro::BinaryEncoder::Error", "<UNDEF> is not a valid boolean value";

throws_ok {
primitive boolean => 2;
} "Avro::BinaryEncoder::Error", "'2' is not a valid boolean value";

throws_ok {
primitive boolean => -1;
} "Avro::BinaryEncoder::Error", "'-1' is not a valid boolean value";

throws_ok {
primitive boolean => 'anything truthy';
} "Avro::BinaryEncoder::Error", "'anything truthy' is not a valid boolean value";

throws_ok {
primitive boolean => {};
} "Avro::BinaryEncoder::Error", "cannot encode a 'HASH' reference as boolean";

## - high-bit of each byte should be set except for last one
## - rest of bits are:
Expand Down Expand Up @@ -74,33 +122,30 @@ sub primitive_ok {
primitive_ok long => -Math::BigInt->new(2)**63, "\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01";

throws_ok {
primitive_ok int => Math::BigInt->new(2)**31;
primitive int => Math::BigInt->new(2)**31;
} "Avro::BinaryEncoder::Error", "32-bit signed int overflow";

throws_ok {
primitive_ok int => -Math::BigInt->new(2)**31 - 1;
primitive int => -Math::BigInt->new(2)**31 - 1;
} "Avro::BinaryEncoder::Error", "32-bit signed int underflow";

throws_ok {
primitive_ok int => Math::BigInt->new(2)**63;
primitive int => Math::BigInt->new(2)**63;
} "Avro::BinaryEncoder::Error", "64-bit signed int overflow";

throws_ok {
primitive_ok long => -Math::BigInt->new(2)**63 - 1;
primitive long => -Math::BigInt->new(2)**63 - 1;
} "Avro::BinaryEncoder::Error", "64-bit signed int underflow";

for (qw(long int)) {
throws_ok {
primitive_ok $_ => "x", undef;
} 'Avro::BinaryEncoder::Error', 'numeric values only';
};
throws_ok {
primitive $_ => "x";
} 'Avro::BinaryEncoder::Error', 'numeric values only' for qw(long int);

# In Unicode, there are decimals that aren't 0-9.
# Make sure we handle non-ascii decimals cleanly.
for (qw(long int)) {
throws_ok {
primitive_ok $_ => "\N{U+0661}", undef;
} 'Avro::BinaryEncoder::Error', 'ascii decimals only';
};
throws_ok {
primitive $_ => "\N{U+0661}";
} 'Avro::BinaryEncoder::Error', 'ascii decimals only' for qw(long int);
}

## spec examples
Expand Down

0 comments on commit 56e0655

Please sign in to comment.