Skip to content

Commit

Permalink
AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979)
Browse files Browse the repository at this point in the history
From John Karp's original description of [the issue]:

> By default in Perl, a string is a sequence of bytes, values 0-255.
> However, if a Unicode character is included that cannot be represented
> with a single byte, the string gets 'upgraded' to a non-byte-based
> Unicode string allowing ordinals outside that range. When string
> operations are done with byte and non-byte Unicode strings, the result
> is always non-byte, with the byte string first 'upgraded'. Upgrading
> consists of utf8 encoding and setting a utf8 flag on the string. ('utf8'
> is a variant of UTF-8 used by Perl)
>
> The Perl Avro API is accepting these Unicode strings as-is for the
> 'bytes' type. This is a problem because
>
>   1. values >255 are not valid as bytes, and any encoding is their job
>
>   2. As Avro assembles the serialized data, Perl 'upgrades' all the data,
>      having the effect of utf8 encoding our serialized binary data.
>
> The correct behavior is for the Avro Perl API is to attempt to downgrade
> the string, and if this fails because it contained values >255 then to
> raise an error. (The behavior of 'string' won't change, it will still
> take Unicode strings as expected.)

This change, based on the one submitted for that ticket, adds these
behaviours and tests to exercise them.

[the issue]: https://issues.apache.org/jira/browse/AVRO-1517
jjatria authored Jun 26, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 477f221 commit 677e982
Showing 4 changed files with 54 additions and 14 deletions.
4 changes: 4 additions & 0 deletions lang/perl/Changes
Original file line number Diff line number Diff line change
@@ -11,6 +11,10 @@ Revision history for Perl extension Avro
for int and long types were off by one
- Silenced a spurious warning that was raised when
validating an undefined value for some data types
- Make sure Unicode strings are downgraded when they
are encoded in fields of type 'byte' or 'fixed'.
Errors resulting from this process will be raised as
Avro::BinaryEncoder::Error exceptions

1.00 Fri Jan 17 15:00:00 2014
- Relicense under apache license 2.0
21 changes: 14 additions & 7 deletions lang/perl/lib/Avro/BinaryEncoder.pm
Original file line number Diff line number Diff line change
@@ -138,15 +138,17 @@ sub encode_double {
sub encode_bytes {
my $class = shift;
my ($schema, $data, $cb) = @_;
encode_long($class, undef, bytes::length($data), $cb);
throw Avro::BinaryEncoder::Error("Invalid data given for 'bytes': Contains values >255")
unless utf8::downgrade($data, 1);
encode_long($class, undef, length($data), $cb);
$cb->(\$data);
}

sub encode_string {
my $class = shift;
my ($schema, $data, $cb) = @_;
my $bytes = Encode::encode_utf8($data);
encode_long($class, undef, bytes::length($bytes), $cb);
encode_long($class, undef, length($bytes), $cb);
$cb->(\$bytes);
}

@@ -270,11 +272,16 @@ sub encode_union {
sub encode_fixed {
my $class = shift;
my ($schema, $data, $cb) = @_;
if (bytes::length $data != $schema->size) {
my $s1 = bytes::length $data;
my $s2 = $schema->size;
throw Avro::BinaryEncoder::Error("Fixed size doesn't match $s1!=$s2");
}

throw Avro::BinaryEncoder::Error("Invalid data given for 'fixed': Contains values >255")
unless utf8::downgrade($data, 1);

my $length = length $data;
my $size = $schema->size;

throw Avro::BinaryEncoder::Error("Fixed size doesn't match $length!=$size")
unless $length == $size;

$cb->(\$data);
}

15 changes: 9 additions & 6 deletions lang/perl/lib/Avro/Schema.pm
Original file line number Diff line number Diff line change
@@ -313,7 +313,11 @@ sub is_data_valid {
if ($type eq 'float' or $type eq 'double') {
$data =~ /^$RE{num}{real}$/ ? return 1 : 0;
}
if ($type eq "bytes" or $type eq "string") {
if ($type eq 'bytes') {
return 0 if ref $data;
return 1 unless utf8::is_utf8($data) and $data =~ /[^\x00-\xFF]/;
}
if ($type eq 'string') {
return 1 unless ref $data;
}
if ($type eq 'boolean') {
@@ -807,11 +811,10 @@ sub new {
}

sub is_data_valid {
my $schema = shift;
my $default = shift;
my $size = $schema->{size};
return 1 if $default && bytes::length $default == $size;
return 0;
my ( $schema, $data ) = @_;

return 0 if utf8::is_utf8($data) && $data =~ /[^\x00-\xFF]/;
return $data && length($data) == $schema->{size};
}

sub size {
28 changes: 27 additions & 1 deletion lang/perl/t/01_schema.t
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ use strict;
use warnings;

use Test::More;
plan tests => 137;
plan tests => 145;
use Test::Exception;
use_ok 'Avro::Schema';

@@ -42,6 +42,32 @@ isa_ok $s2, 'Avro::Schema::Primitive';
is $s2->type, "string", "type is string";
is $s, $s2, "string Schematas are singletons";

## Perl strings as bytes
{
my $schema = Avro::Schema->parse(q({"type": "bytes"}));
ok $schema->is_data_valid(''), 'Empty string is valid as bytes';
ok $schema->is_data_valid("\0"), 'Zero byte is valid as bytes';
ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as bytes';

my $bytes = '';
utf8::upgrade($bytes);

ok $schema->is_data_valid($bytes), 'Upgraded string valid as bytes';
}

## Perl strings as fixed
{
my $schema = Avro::Schema->parse(q({"type": "fixed", "name": "foo", "size": 1 }));
ok !$schema->is_data_valid(''), 'Too few bytes vs. schema';
ok $schema->is_data_valid("\0"), 'Zero byte is valid as fixed';
ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as fixed';

my $bytes = "\xff";
utf8::upgrade($bytes);

ok $schema->is_data_valid($bytes), 'Upgraded string valid as fixed';
}

## Records
{
my $s3 = Avro::Schema::Record->new(

0 comments on commit 677e982

Please sign in to comment.