From 677e9829bae30cc76527c6f5702f8c2384be61c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Joaqu=C3=ADn=20Atria?= Date: Wed, 26 Jun 2024 16:12:45 +0100 Subject: [PATCH] AVRO-1517: [Perl] Encode UTF-8 strings as bytes (#2979) From John Karp's original description of [the issue]: > By default in Perl, a string is a sequence of bytes, values 0-255. > However, if a Unicode character is included that cannot be represented > with a single byte, the string gets 'upgraded' to a non-byte-based > Unicode string allowing ordinals outside that range. When string > operations are done with byte and non-byte Unicode strings, the result > is always non-byte, with the byte string first 'upgraded'. Upgrading > consists of utf8 encoding and setting a utf8 flag on the string. ('utf8' > is a variant of UTF-8 used by Perl) > > The Perl Avro API is accepting these Unicode strings as-is for the > 'bytes' type. This is a problem because > > 1. values >255 are not valid as bytes, and any encoding is their job > > 2. As Avro assembles the serialized data, Perl 'upgrades' all the data, > having the effect of utf8 encoding our serialized binary data. > > The correct behavior is for the Avro Perl API is to attempt to downgrade > the string, and if this fails because it contained values >255 then to > raise an error. (The behavior of 'string' won't change, it will still > take Unicode strings as expected.) This change, based on the one submitted for that ticket, adds these behaviours and tests to exercise them. [the issue]: https://issues.apache.org/jira/browse/AVRO-1517 --- lang/perl/Changes | 4 ++++ lang/perl/lib/Avro/BinaryEncoder.pm | 21 ++++++++++++++------- lang/perl/lib/Avro/Schema.pm | 15 +++++++++------ lang/perl/t/01_schema.t | 28 +++++++++++++++++++++++++++- 4 files changed, 54 insertions(+), 14 deletions(-) diff --git a/lang/perl/Changes b/lang/perl/Changes index 67805602843..c1551566f5f 100644 --- a/lang/perl/Changes +++ b/lang/perl/Changes @@ -11,6 +11,10 @@ Revision history for Perl extension Avro for int and long types were off by one - Silenced a spurious warning that was raised when validating an undefined value for some data types + - Make sure Unicode strings are downgraded when they + are encoded in fields of type 'byte' or 'fixed'. + Errors resulting from this process will be raised as + Avro::BinaryEncoder::Error exceptions 1.00 Fri Jan 17 15:00:00 2014 - Relicense under apache license 2.0 diff --git a/lang/perl/lib/Avro/BinaryEncoder.pm b/lang/perl/lib/Avro/BinaryEncoder.pm index 18a25813e40..47eb93116f0 100644 --- a/lang/perl/lib/Avro/BinaryEncoder.pm +++ b/lang/perl/lib/Avro/BinaryEncoder.pm @@ -138,7 +138,9 @@ sub encode_double { sub encode_bytes { my $class = shift; my ($schema, $data, $cb) = @_; - encode_long($class, undef, bytes::length($data), $cb); + throw Avro::BinaryEncoder::Error("Invalid data given for 'bytes': Contains values >255") + unless utf8::downgrade($data, 1); + encode_long($class, undef, length($data), $cb); $cb->(\$data); } @@ -146,7 +148,7 @@ sub encode_string { my $class = shift; my ($schema, $data, $cb) = @_; my $bytes = Encode::encode_utf8($data); - encode_long($class, undef, bytes::length($bytes), $cb); + encode_long($class, undef, length($bytes), $cb); $cb->(\$bytes); } @@ -270,11 +272,16 @@ sub encode_union { sub encode_fixed { my $class = shift; my ($schema, $data, $cb) = @_; - if (bytes::length $data != $schema->size) { - my $s1 = bytes::length $data; - my $s2 = $schema->size; - throw Avro::BinaryEncoder::Error("Fixed size doesn't match $s1!=$s2"); - } + + throw Avro::BinaryEncoder::Error("Invalid data given for 'fixed': Contains values >255") + unless utf8::downgrade($data, 1); + + my $length = length $data; + my $size = $schema->size; + + throw Avro::BinaryEncoder::Error("Fixed size doesn't match $length!=$size") + unless $length == $size; + $cb->(\$data); } diff --git a/lang/perl/lib/Avro/Schema.pm b/lang/perl/lib/Avro/Schema.pm index 3451b0b4d71..200d2947cd7 100644 --- a/lang/perl/lib/Avro/Schema.pm +++ b/lang/perl/lib/Avro/Schema.pm @@ -313,7 +313,11 @@ sub is_data_valid { if ($type eq 'float' or $type eq 'double') { $data =~ /^$RE{num}{real}$/ ? return 1 : 0; } - if ($type eq "bytes" or $type eq "string") { + if ($type eq 'bytes') { + return 0 if ref $data; + return 1 unless utf8::is_utf8($data) and $data =~ /[^\x00-\xFF]/; + } + if ($type eq 'string') { return 1 unless ref $data; } if ($type eq 'boolean') { @@ -807,11 +811,10 @@ sub new { } sub is_data_valid { - my $schema = shift; - my $default = shift; - my $size = $schema->{size}; - return 1 if $default && bytes::length $default == $size; - return 0; + my ( $schema, $data ) = @_; + + return 0 if utf8::is_utf8($data) && $data =~ /[^\x00-\xFF]/; + return $data && length($data) == $schema->{size}; } sub size { diff --git a/lang/perl/t/01_schema.t b/lang/perl/t/01_schema.t index f844ef0f619..44a5809293c 100644 --- a/lang/perl/t/01_schema.t +++ b/lang/perl/t/01_schema.t @@ -19,7 +19,7 @@ use strict; use warnings; use Test::More; -plan tests => 137; +plan tests => 145; use Test::Exception; use_ok 'Avro::Schema'; @@ -42,6 +42,32 @@ isa_ok $s2, 'Avro::Schema::Primitive'; is $s2->type, "string", "type is string"; is $s, $s2, "string Schematas are singletons"; +## Perl strings as bytes +{ + my $schema = Avro::Schema->parse(q({"type": "bytes"})); + ok $schema->is_data_valid(''), 'Empty string is valid as bytes'; + ok $schema->is_data_valid("\0"), 'Zero byte is valid as bytes'; + ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as bytes'; + + my $bytes = ''; + utf8::upgrade($bytes); + + ok $schema->is_data_valid($bytes), 'Upgraded string valid as bytes'; +} + +## Perl strings as fixed +{ + my $schema = Avro::Schema->parse(q({"type": "fixed", "name": "foo", "size": 1 })); + ok !$schema->is_data_valid(''), 'Too few bytes vs. schema'; + ok $schema->is_data_valid("\0"), 'Zero byte is valid as fixed'; + ok !$schema->is_data_valid("\x{100}"), 'Values > 255 not valid as fixed'; + + my $bytes = "\xff"; + utf8::upgrade($bytes); + + ok $schema->is_data_valid($bytes), 'Upgraded string valid as fixed'; +} + ## Records { my $s3 = Avro::Schema::Record->new(