From e5179e930d4abf03740576b196182b4cbf17cb1e Mon Sep 17 00:00:00 2001 From: Oscar Westra van Holthe - Kind Date: Thu, 3 Nov 2022 09:38:52 +0100 Subject: [PATCH] AVRO-3666: Separate parsing from Schema class This allows using pluggable parser implementations, allowing multiple formats to be parsed with the same code. --- .../Getting started (Java)/_index.md | 6 +- .../apache/avro/FormattedSchemaParser.java | 68 +++++ .../org/apache/avro/JsonSchemaParser.java | 90 +++++++ .../java/org/apache/avro/SchemaParser.java | 251 ++++++++++++++++++ .../org/apache/avro/util/UtfTextUtils.java | 247 +++++++++++++++++ .../org/apache/avro/DummySchemaParser.java | 45 ++++ .../org/apache/avro/TestSchemaParser.java | 107 ++++++++ .../apache/avro/util/UtfTextUtilsTest.java | 132 +++++++++ .../org.apache.avro.FormattedSchemaParser | 18 ++ .../java/org/apache/avro/idl/IdlReader.java | 65 +++-- .../org/apache/avro/idl/IdlSchemaParser.java | 52 ++++ .../org.apache.avro.FormattedSchemaParser | 18 ++ 12 files changed, 1073 insertions(+), 26 deletions(-) create mode 100644 lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java create mode 100644 lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser create mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java create mode 100644 lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser diff --git 
a/doc/content/en/docs/++version++/Getting started (Java)/_index.md b/doc/content/en/docs/++version++/Getting started (Java)/_index.md index f3944cf020e..a4be7a64ae0 100644 --- a/doc/content/en/docs/++version++/Getting started (Java)/_index.md +++ b/doc/content/en/docs/++version++/Getting started (Java)/_index.md @@ -77,7 +77,7 @@ You may also build the required Avro jars from source. Building Avro is beyond t ## Defining a schema -Avro schemas are defined using JSON. Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc: +Avro schemas are defined using JSON or IDL (the latter requires an extra dependency). Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc: ```json {"namespace": "example.avro", @@ -209,10 +209,10 @@ Data in Avro is always stored with its corresponding schema, meaning we can alwa Let's go over the same example as in the previous section, but without using code generation: we'll create some users, serialize them to a data file on disk, and then read back the file and deserialize the users objects. ### Creating users -First, we use a Parser to read our schema definition and create a Schema object. +First, we use a SchemaParser to read our schema definition and create a Schema object. ```java -Schema schema = new Schema.Parser().parse(new File("user.avsc")); +Schema schema = new SchemaParser().parse(new File("user.avsc")); ``` Using this schema, let's create some users. 
diff --git a/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java new file mode 100644 index 00000000000..2b4eeaa77e5 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +/** + * Schema parser for a specific schema format. + * + *

+ * The {@link SchemaParser} class uses this interface, supporting text based schema sources. + *

+ * + *

Note to implementers:

+ * + *

+ * Implementations are located using a {@link java.util.ServiceLoader}. See that class for details. + *

+ * + *

+ * The text being read may be invalid or in another format entirely, so you are encouraged to return {@code null} upon parsing failure + * where the input clearly doesn't make sense (e.g., reading "/**" when expecting JSON). If the input is likely in the + * correct format, but invalid, throw a {@link SchemaParseException} instead. + *

+ * + *

+ * Note that throwing anything other than a {@code SchemaParseException} will abort the parsing process, so reserve that + * for rethrowing exceptions. + *

+ * + * @see java.util.ServiceLoader + */ +public interface FormattedSchemaParser { + /** + * Parse a schema from a text based source. Can use the base location of the schema (e.g., the directory where the + * schema file lives) if available. + * + *

+ * Implementations should add all named schemas they parse to the collection. + *

+ * + * @param types a mutable collection of known types; parsed named schemata will be added + * @param baseUri the base location of the schema, or {@code null} if not known + * @param formattedSchema the schema as text + * @return the parsed schema, or {@code null} if the format is not supported + * @throws IOException when the schema cannot be read + * @throws SchemaParseException when the schema cannot be parsed + */ + Schema parse(Collection types, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException; +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java new file mode 100644 index 00000000000..9a1da447d15 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Collection; + +/** + * Schema parser for JSON formatted schemata. This initial implementation simply + * delegates to the {@link Schema.Parser} class, though it should be refactored + * out of there. 
+ * + *

+ * Note: this class is intentionally not available via the Java + * {@link java.util.ServiceLoader}, as its use is hardcoded as fallback when no + * service exists. This enables users to reliably override the standard JSON + * parser as well. + *

+ */ +public class JsonSchemaParser implements FormattedSchemaParser { + /** + *

+ * Parse a schema written in the internal (JSON) format without any validations. + *

+ * + *

+ * Using this method is only safe if used to parse a write schema (i.e., the + * schema that existing Avro data was written with, which is needed to read it). Other usages, for example by generated Avro + * code, can cause interoperability problems. + *

+ * + *

+ * Use with care and sufficient testing! + *

+ * + * @param fragments one or more strings making up the schema (some schemata + * exceed the compiler limits) + * @return the parsed schema + */ + public static Schema parseInternal(String... fragments) { + StringBuilder buffer = new StringBuilder(); + for (String fragment : fragments) { + buffer.append(fragment); + } + return new JsonSchemaParser().parse(new ArrayList<>(), buffer, true); + } + + @Override + public Schema parse(Collection schemas, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + return parse(schemas, formattedSchema, false); + } + + private Schema parse(Collection schemas, CharSequence formattedSchema, boolean skipValidation) + throws SchemaParseException { + // TODO: refactor JSON parsing out of the Schema class + Schema.Parser parser; + if (skipValidation) { + parser = new Schema.Parser(Schema.NameValidator.NO_VALIDATION); + parser.setValidateDefaults(false); + } else { + parser = new Schema.Parser(); + } + if (schemas != null) { + parser.addTypes(schemas); + } + Schema schema = parser.parse(formattedSchema.toString()); + if (schemas != null) { + schemas.addAll(parser.getTypes().values()); + } + return schema; + } +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java new file mode 100644 index 00000000000..c100f724b8e --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import org.apache.avro.util.UtfTextUtils; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.net.URI; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.ServiceLoader; +import java.util.Set; + +/** + * Avro schema parser for text-based formats like JSON, IDL, etc. + * + *

+ * Parses formatted (i.e., text based) schemata from a given source using the + * available {@link FormattedSchemaParser} implementations, and returns the + * first result. This means it can transparently handle any schema format. The + * Avro project defines a JSON based format and an IDL format (the latter + * available as a separate dependency), but you can also provide your own. + *

+ * + *

+ * The parser can handle various text based sources. If the source contains a + * UTF encoded, Latin text based format it can even detect which UTF encoding was + * used (UTF-8, UTF-16BE, UTF-16LE, UTF-32BE or UTF-32LE). + *

+ * + * @see FormattedSchemaParser + * @see UtfTextUtils + */ +public class SchemaParser { + private final Set knownSchemata; + private final Collection formattedSchemaParsers; + + /** + * Create a schema parser. Initially, the list of known (named) schemata is + * empty. + */ + public SchemaParser() { + this.knownSchemata = new LinkedHashSet<>(); + this.formattedSchemaParsers = new ArrayList<>(); + for (FormattedSchemaParser formattedSchemaParser : ServiceLoader.load(FormattedSchemaParser.class)) { + formattedSchemaParsers.add(formattedSchemaParser); + } + // Add the default / JSON parser last (not as a service, even though it + // implements the service interface), to allow implementations that parse JSON + // files into schemata differently. + formattedSchemaParsers.add(new JsonSchemaParser()); + } + + /** + * Parse an Avro schema from a file. The file content is assumed to be UTF-8 + * text. + * + * @param file the file to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(File file) throws IOException, SchemaParseException { + return parse(file, null); + } + + /** + * Parse an Avro schema from a file written with a specific character set. + * + * @param file the file to read + * @param charset the character set of the file contents + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(File file, Charset charset) throws IOException, SchemaParseException { + return parse(file.toPath(), charset); + } + + /** + * Parse an Avro schema from a file. The file content is assumed to be UTF-8 + * text. 
+ * + * @param file the file to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(Path file) throws IOException, SchemaParseException { + return parse(file, null); + } + + /** + * Parse an Avro schema from a file written with a specific character set. + * + * @param file the file to read; its parent directory is used as base location of the schema + * @param charset the character set of the file contents, or {@code null} to detect the UTF encoding + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(Path file, Charset charset) throws IOException, SchemaParseException { + URI inputDir = file.toAbsolutePath().getParent().toUri(); // a bare file name has no parent until made absolute + try (InputStream stream = Files.newInputStream(file)) { + String formattedSchema = UtfTextUtils.readAllBytes(stream, charset); + return parse(inputDir, formattedSchema); + } + } + + /** + * Parse an Avro schema from an input stream. The stream content is assumed to + * be UTF encoded text; the UTF variant is detected. Note that the stream stays open after reading. + * + * @param in the stream to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(InputStream in) throws IOException, SchemaParseException { + return parse(in, null); + } + + /** + * Parse an Avro schema from an input stream. Note that the stream stays open + * after reading.
+ * + * @param in the stream to read + * @param charset the character set of the stream contents + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(InputStream in, Charset charset) throws IOException, SchemaParseException { + return parse(UtfTextUtils.readAllBytes(in, charset)); + } + + /** + * Parse an Avro schema from an input reader. + * + * @param in the stream to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(Reader in) throws IOException, SchemaParseException { + return parse(UtfTextUtils.readAllChars(in)); + } + + /** + * Parse an Avro schema from a string. + * + * @param text the text to parse + * @return the schema + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(CharSequence text) throws SchemaParseException { + try { + return parse(null, text); + } catch (IOException e) { + // This can only happen if parser implementations try to read other (related) + // schemata from somewhere. + throw new AvroRuntimeException("Could not read schema", e); + } + } + + /** + * Parse the given schema (string) within the specified context using all + * available {@link FormattedSchemaParser} implementations, collecting any + * {@link SchemaParseException}s that occur, and return the first successfully + * parsed schema. If all parsers fail, throw a {@code SchemaParseException} with + * all collected parse exceptions added as suppressed exceptions. Uses the base + * location of the schema (e.g., the directory where the schema file lives) if + * available. 
+ * + * @param baseUri the base location of the schema, or {@code null} if + * not known + * @param formattedSchema the schema as text + * @return the parsed schema + * @throws IOException if thrown by one of the parsers + * @throws RuntimeException if thrown by one of the parsers + * @throws SchemaParseException when all parsers fail + */ + private Schema parse(URI baseUri, CharSequence formattedSchema) throws IOException, SchemaParseException { + List parseExceptions = new ArrayList<>(); + for (FormattedSchemaParser formattedSchemaParser : formattedSchemaParsers) { + try { + // Ensure we're only changing (adding to) the known types when a parser succeeds + Set schemaSet = new LinkedHashSet<>(knownSchemata); + Schema schema = formattedSchemaParser.parse(schemaSet, baseUri, formattedSchema); + if (schema != null) { + knownSchemata.addAll(schemaSet); + return schema; + } + } catch (SchemaParseException e) { + parseExceptions.add(e); + } + } + + // None of the available parsers succeeded + + if (parseExceptions.size() == 1) { + throw parseExceptions.get(0); + } + SchemaParseException parseException = new SchemaParseException( + "Could not parse the schema (the suppressed exceptions tell why)."); + parseExceptions.forEach(parseException::addSuppressed); + throw parseException; + } +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java b/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java new file mode 100644 index 00000000000..447c791edee --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.util; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +/** + * Text utilities especially suited for UTF encoded bytes. + * + *

+ * When the character set is unknown, methods in this class assume UTF encoded + * text and try to detect the UTF variant (8/16/32 bits, big/little endian), + * using the BOM (if present) or an educated guess assuming the first character + * is in the range U+0000-U+00FF. This heuristic works for all latin text based + * formats, which includes Avro IDL, JSON, XML, etc. If the heuristic fails, + * UTF-8 is assumed. + *

+ * + * @see XML specification, + * appendix F: Autodetection of Character Encodings (Non-Normative) + */ +public class UtfTextUtils { + private static final int TRANSFER_BUFFER_SIZE = 4096; + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32. + */ + private static final Charset UTF_32 = Charset.forName("UTF-32"); + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32BE. + */ + private static final Charset UTF_32BE = Charset.forName("UTF-32BE"); + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32LE. + */ + private static final Charset UTF_32LE = Charset.forName("UTF-32LE"); + + public static String asString(byte[] bytes, Charset charset) { + if (charset == null) { + charset = detectUtfCharset(bytes); + } + return skipBOM(new String(bytes, charset)); + } + + /** + * Reads the specified input stream as text. If {@code charset} is {@code null}, + * the method will assume UTF encoded text and attempt to detect the appropriate + * charset. 
+ * + * @param input the input to read + * @param charset the character set of the input, if known + * @return all bytes, read into a string + * @throws IOException when reading the input fails for some reason + */ + public static String readAllBytes(InputStream input, Charset charset) throws IOException { + if (charset == null) { + input = ensureMarkSupport(input); + input.mark(4); + byte[] buffer = new byte[4]; + int bytesRead = fillBuffer(input, buffer); + input.reset(); + + charset = detectUtfCharset0(buffer, bytesRead); + + if (charset == null) { + throw new IOException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)"); + } + } + Reader reader = new InputStreamReader(input, charset); + return readAllChars(reader); + } + + private static InputStream ensureMarkSupport(InputStream input) { + if (input.markSupported()) { + return input; + } else { + return new BufferedInputStream(input); + } + } + + private static int fillBuffer(InputStream in, byte[] buf) throws IOException { + int remaining = buf.length; + int offset = 0; + while (remaining > 0) { + int bytesRead = in.read(buf, offset, remaining); + // As remaining > 0, bytesRead is either -1 or positive + if (bytesRead == -1) { + break; + } + offset += bytesRead; + remaining -= bytesRead; + } + return offset; + } + + public static String readAllChars(Reader input) throws IOException { + StringBuilder buffer = new StringBuilder(); + char[] charBuffer = new char[TRANSFER_BUFFER_SIZE]; + int charsRead; + while ((charsRead = input.read(charBuffer, 0, TRANSFER_BUFFER_SIZE)) >= 0) { + buffer.append(charBuffer, 0, charsRead); + } + return skipBOM(buffer); + } + + private static String skipBOM(CharSequence buffer) { + if (buffer.length() > 0 && buffer.charAt(0) == '\uFEFF') { // empty input has no BOM to skip + return buffer.subSequence(1, buffer.length()).toString(); + } + return buffer.toString(); + } + + /** + * Assuming UTF encoded bytes, detect the UTF variant (8/16/32 bits, big/little + * endian). + * + *

+ * To ensure the most accurate detection, the algorithm requires at least 4 + * bytes. One should only provide less than 4 bytes of data if that is all there + * is. + *

+ * + *

+ * Detection is certain when a byte order mark (BOM) is used. Otherwise a + * heuristic is used, which works when the first character is from the first 256 + * characters from the BMP (U+0000-U+00FF). This works for all latin-based + * textual formats, like Avro IDL, JSON, YAML, XML, etc. + *

+ * + * @param firstFewBytes the first few bytes of the text to detect the character + * set of + * @return the character set to use + */ + public static Charset detectUtfCharset(byte[] firstFewBytes) { + Charset detectedCharset = detectUtfCharset0(firstFewBytes, firstFewBytes.length); + if (detectedCharset == null) { + throw new IllegalArgumentException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)"); + } + return detectedCharset; + } + + private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) { + /* + * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing + * It omits non-UTF encodings (the 2nd and 3rd rows from the end). + * Note that the order (with respect to UTF-32 & UTF-16) is important! + * + * (the non-zero bytes encode the byte order mark, BOM) + * + * spotless:off + * Match the 'magic bytes' in order, and take the first match: + * 00 00 FE FF -> UTF-32 (be) + * FF FE 00 00 -> UTF-32 (le) + * 00 00 FF FE -> unsupported UCS-4 (byte order 2143) + * FE FF 00 00 -> unsupported UCS-4 (byte order 3412) + * FE FF __ __ -> UTF-16 (be) + * FF FE __ __ -> UTF-16 (le) + * EF BB BF __ -> UTF-8 + * 00 00 00 __ -> UTF-32BE + * __ 00 00 00 -> UTF-32LE + * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143) + * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412) + * 00 __ __ __ -> UTF-16BE + * __ 00 __ __ -> UTF-16LE + * __ __ __ __ -> UTF-8 (fallback) + * spotless:on + */ + int quad = quad(firstFewBytes, numBytes); + int word = quad >>> 16; + if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) { + // With BOM: UTF-32 (Charset handles BOM & endianness) + return UTF_32; + } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) { + // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 
3412) + return null; + } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) { + // With BOM: UTF-16 (Charset handles BOM & endianness) + return StandardCharsets.UTF_16; + } else if (numBytes > 2 && quad >>> 8 == 0xEFBBBF) { + // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it) + return StandardCharsets.UTF_8; + } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) { + // Without BOM (i.e., a guess) + return UTF_32BE; + } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) { + // Without BOM (i.e., a guess) + return UTF_32LE; + } else if (numBytes > 3 && ((quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0)) { + // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143 + // resp. 3412) + return null; + } else if (numBytes > 1 && (word & 0xFF00) == 0) { + // Without BOM (i.e., a guess) + return StandardCharsets.UTF_16BE; + } else if (numBytes > 1 && (word & 0x00FF) == 0) { + // Without BOM (i.e., a guess) + return StandardCharsets.UTF_16LE; + } else { + // Fallback + return StandardCharsets.UTF_8; + } + } + + private static int quad(byte[] bytes, int length) { + int quad = 0xFFFFFFFF; + switch (length) { + default: + quad = (quad & 0xFFFFFF00) | (bytes[3] & 0xFF); + // Fallthrough + case 3: + quad = (quad & 0xFFFF00FF) | (bytes[2] & 0xFF) << 8; + // Fallthrough + case 2: + quad = (quad & 0xFF00FFFF) | (bytes[1] & 0xFF) << 16; + // Fallthrough + case 1: + quad = (quad & 0x00FFFFFF) | (bytes[0] & 0xFF) << 24; + // Fallthrough + case 0: + break; + } + return quad; + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java b/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java new file mode 100644 index 00000000000..0a20beadfa7 --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +public class DummySchemaParser implements FormattedSchemaParser { + public static final String SCHEMA_TEXT_ONE = "one"; + public static final Schema FIXED_SCHEMA = Schema.createFixed("DummyOne", null, "tests", 42); + public static final String SCHEMA_TEXT_ERROR = "error"; + public static final String SCHEMA_TEXT_IO_ERROR = "ioerror"; + public static final String ERROR_MESSAGE = "Syntax error"; + public static final String IO_ERROR_MESSAGE = "I/O error"; + + @Override + public Schema parse(Collection schemata, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + if (SCHEMA_TEXT_ONE.contentEquals(formattedSchema)) { + return FIXED_SCHEMA; + } else if (SCHEMA_TEXT_ERROR.contentEquals(formattedSchema)) { + throw new SchemaParseException(ERROR_MESSAGE); + } else if (SCHEMA_TEXT_IO_ERROR.contentEquals(formattedSchema)) { + throw new IOException(IO_ERROR_MESSAGE); + } + // Syntax not recognized + return null; + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java new file mode 100644 index 00000000000..dc0c77431fe --- /dev/null +++ 
b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import com.fasterxml.jackson.core.JsonParseException; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestSchemaParser { + private static final Schema SCHEMA_REAL = Schema.createFixed("Real", null, "tests", 42); + private static final String SCHEMA_JSON = SCHEMA_REAL.toString(false); + + @Test + void testParseFile() throws IOException { + Path tempFile = Files.createTempFile("TestSchemaParser", null); + Files.write(tempFile, singletonList(SCHEMA_JSON)); + + Schema schema = new SchemaParser().parse(tempFile.toFile()); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParsePath() throws IOException { + Path 
tempFile = Files.createTempFile("TestSchemaParser", null); + Files.write(tempFile, singletonList(SCHEMA_JSON)); + + Schema schema = new SchemaParser().parse(tempFile); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseReader() throws IOException { + Schema schema = new SchemaParser().parse(new StringReader(SCHEMA_JSON)); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseStream() throws IOException { + Schema schema = new SchemaParser().parse(new ByteArrayInputStream(SCHEMA_JSON.getBytes(StandardCharsets.UTF_16))); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseTextWithFallbackJsonParser() { + Schema schema = new SchemaParser().parse(SCHEMA_JSON); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseByCustomParser() { + Schema schema = new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_ONE); + assertEquals(DummySchemaParser.FIXED_SCHEMA, schema); + } + + @Test + void testSingleParseError() { + SchemaParseException parseException = assertThrows(SchemaParseException.class, + () -> new SchemaParser().parse("foo")); + assertEquals(JsonParseException.class, parseException.getCause().getClass()); + assertEquals(0, parseException.getSuppressed().length); + } + + @Test + void testMultipleParseErrors() { + SchemaParseException parseException = assertThrows(SchemaParseException.class, + () -> new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_ERROR)); + assertTrue(parseException.getMessage().startsWith("Could not parse the schema")); + Throwable[] suppressed = parseException.getSuppressed(); + assertEquals(2, suppressed.length); + assertEquals(DummySchemaParser.ERROR_MESSAGE, suppressed[0].getMessage()); + assertEquals(JsonParseException.class, suppressed[1].getCause().getClass()); + } + + @Test + void testIOFailureWhileParsingText() { + AvroRuntimeException exception = assertThrows(AvroRuntimeException.class, + () -> new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_IO_ERROR)); + 
assertEquals(IOException.class, exception.getCause().getClass()); + assertEquals(DummySchemaParser.IO_ERROR_MESSAGE, exception.getCause().getMessage()); + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java b/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java new file mode 100644 index 00000000000..6c525e6d39a --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.avro.util; + +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +
/**
 * Tests for {@link UtfTextUtils}: charset detection on raw bytes, and
 * conversion of byte arrays and input streams to text. Inputs are written as
 * hexadecimal strings and decoded with {@link #hexBytes(String)}.
 */
@SuppressWarnings("SpellCheckingInspection")
class UtfTextUtilsTest {
  /** Detection for inputs that start with a byte order mark. */
  @Test
  void validateCharsetDetectionWithBOM() {
    assertEquals("UTF-32", detectedName("0000FEFF"));
    assertEquals("UTF-32", detectedName("FFFE0000"));
    assertEquals("UTF-16", detectedName("FEFF0041"));
    assertEquals("UTF-16", detectedName("FFFE4100"));
    assertEquals("UTF-8", detectedName("EFBBBF41"));

    // Byte orders of UCS-4 that are known to be unsupported must be rejected.
    assertThrows(IllegalArgumentException.class, () -> detectedName("0000FFFE"));
    assertThrows(IllegalArgumentException.class, () -> detectedName("FEFF0000"));
  }

  /** Detection for inputs of 0 to 4 bytes that carry no byte order mark. */
  @Test
  void validateCharsetDetectionWithoutBOM() {
    // Four bytes
    assertEquals("UTF-32BE", detectedName("00000041"));
    assertEquals("UTF-32LE", detectedName("41000000"));
    assertEquals("UTF-16BE", detectedName("00410042"));
    assertEquals("UTF-16LE", detectedName("41004200"));
    assertEquals("UTF-8", detectedName("41424344"));

    // Three bytes
    assertEquals("UTF-8", detectedName("414243"));

    // Two bytes
    assertEquals("UTF-16BE", detectedName("0041"));
    assertEquals("UTF-16LE", detectedName("4100"));
    assertEquals("UTF-8", detectedName("4142"));

    // One byte
    assertEquals("UTF-8", detectedName("41"));

    // No bytes at all
    assertEquals("UTF-8", detectedName(""));

    // Zero patterns that look like unsupported UCS-4 byte orders must be rejected.
    assertThrows(IllegalArgumentException.class, () -> detectedName("00004100"));
    assertThrows(IllegalArgumentException.class, () -> detectedName("00410000"));
  }

  /** Name of the charset that {@link UtfTextUtils} detects for the given hex-encoded bytes. */
  private static String detectedName(String hex) {
    Charset detected = UtfTextUtils.detectUtfCharset(hexBytes(hex));
    return detected.name();
  }

  /**
   * Decodes a string of hexadecimal digit pairs into bytes. A trailing unpaired
   * digit is ignored.
   */
  private static byte[] hexBytes(String hexBytes) {
    final int count = hexBytes.length() / 2;
    final byte[] decoded = new byte[count];
    int offset = 0;
    for (int i = 0; i < count; i++, offset += 2) {
      String pair = hexBytes.substring(offset, offset + 2);
      decoded[i] = (byte) Integer.parseUnsignedInt(pair, 16);
    }
    return decoded;
  }

  /** Byte arrays become text, with an explicit charset and with {@code null}. */
  @Test
  void validateTextConversionFromBytes() {
    final byte[] withBom = hexBytes("EFBBBF41");
    assertEquals("A", UtfTextUtils.asString(withBom, StandardCharsets.UTF_8));
    assertEquals("A", UtfTextUtils.asString(withBom, null));

    final byte[] withoutBom = hexBytes("41");
    assertEquals("A", UtfTextUtils.asString(withoutBom, StandardCharsets.UTF_8));
    assertEquals("A", UtfTextUtils.asString(withoutBom, null));
  }

  /** Streams become text, with an explicit charset and with {@code null}. */
  @Test
  void validateTextConversionFromStreams() throws IOException {
    assertEquals("A", readText("EFBBBF41", StandardCharsets.UTF_8));
    assertEquals("A", readText("EFBBBF41", null));

    assertEquals("A", readText("41", StandardCharsets.UTF_8));
    assertEquals("A", readText("41", null));

    // When reading a stream, an unsupported UCS-4 byte order is reported as an
    // IOException rather than as an IllegalArgumentException.
    assertThrows(IOException.class, () -> readText("0000FFFE", null));
  }

  /** Reads the hex-encoded bytes through a fresh {@link ByteArrayInputStream}. */
  private static String readText(String hex, Charset charset) throws IOException {
    InputStream source = new ByteArrayInputStream(hexBytes(hex));
    return UtfTextUtils.readAllBytes(source, charset);
  }

  /** Reading also works for a stream that reports no mark/reset support. */
  @Test
  void validateSupportForUnmarkableStreams() throws IOException {
    InputStream unmarkable = new UnmarkableInputStream(new ByteArrayInputStream(hexBytes("41424344")));
    assertEquals("ABCD", UtfTextUtils.readAllBytes(unmarkable, null));
  }

  /**
   * Stream wrapper that denies mark/reset: {@code markSupported()} is false,
   * {@code mark} does nothing and {@code reset} always fails.
   */
  private static class UnmarkableInputStream extends FilterInputStream {
    public UnmarkableInputStream(InputStream input) {
      super(input);
    }

    @Override
    public boolean markSupported() {
      return false;
    }

    @Override
    public synchronized void mark(int ignored) {
      // Intentionally empty: marking is not supported.
    }

    @Override
    public synchronized void reset() throws IOException {
      throw new IOException("mark/reset not supported");
    }
  }
}
diff --git a/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser b/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser new file mode 100644 index 00000000000..b2db6ddb269 --- /dev/null +++ b/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +org.apache.avro.DummySchemaParser diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java index f2419f5f551..ea3f3cff6df 100644 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java +++ b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java @@ -35,6 +35,7 @@ import org.antlr.v4.runtime.Recognizer; import org.antlr.v4.runtime.Token; import org.apache.avro.JsonProperties; +import org.apache.avro.JsonSchemaParser; import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; import org.apache.avro.Protocol; @@ -68,6 +69,7 @@ import org.apache.avro.idl.IdlParser.SchemaPropertyContext; import org.apache.avro.idl.IdlParser.UnionTypeContext; import org.apache.avro.idl.IdlParser.VariableDeclarationContext; +import org.apache.avro.util.UtfTextUtils; import org.apache.avro.util.internal.Accessor; import org.apache.commons.text.StringEscapeUtils; @@ -83,6 +85,7 @@ import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.Deque; import java.util.HashSet; @@ -134,7 +137,10 @@ public void syntaxError(Recognizer recognizer, Object offendingSymbol, int * Predicate to check for valid names. Should probably be delegated to the * Schema class. 
*/ - private static final Predicate VALID_NAME = Pattern.compile("[_\\p{L}][_\\p{L}\\d]*").asPredicate(); + private static final Predicate VALID_NAME = Pattern + .compile("[_\\p{L}][_\\p{LD}]*", Pattern.UNICODE_CHARACTER_CLASS | Pattern.UNICODE_CASE | Pattern.CANON_EQ) + .asPredicate(); + private static final Set INVALID_TYPE_NAMES = new HashSet<>(Arrays.asList("boolean", "int", "long", "float", "double", "bytes", "string", "null", "date", "time_ms", "timestamp_ms", "localtimestamp_ms", "uuid")); private static final String CLASSPATH_SCHEME = "classpath"; @@ -159,15 +165,13 @@ private Schema namedSchemaOrUnresolved(String fullName) { return schema; } - private void setTypes(Map types) { + private void setTypes(Collection types) { names.clear(); - for (Schema schema : types.values()) { - addSchema(schema); - } + addTypes(types); } - public void addTypes(Map types) { - for (Schema schema : types.values()) { + public void addTypes(Collection types) { + for (Schema schema : types) { addSchema(schema); } } @@ -185,20 +189,28 @@ public IdlFile parse(Path location) throws IOException { } IdlFile parse(URI location) throws IOException { - try (InputStream stream = location.toURL().openStream()) { - readLocations.add(location); - URI inputDir = location; - if ("jar".equals(location.getScheme())) { - String jarUriAsString = location.toString(); - String pathFromJarRoot = jarUriAsString.substring(jarUriAsString.indexOf("!/") + 2); - inputDir = URI.create(CLASSPATH_SCHEME + ":/" + pathFromJarRoot); - } - inputDir = inputDir.resolve("."); + readLocations.add(location); + URI inputDir = location; + if ("jar".equals(location.getScheme())) { + String jarUriAsString = location.toString(); + String pathFromJarRoot = jarUriAsString.substring(jarUriAsString.indexOf("!/") + 2); + inputDir = URI.create(CLASSPATH_SCHEME + ":/" + pathFromJarRoot); + } + inputDir = inputDir.resolve("."); - return parse(inputDir, CharStreams.fromStream(stream, StandardCharsets.UTF_8)); + try (InputStream 
stream = location.toURL().openStream()) { + String inputString = UtfTextUtils.readAllBytes(stream, null); + return parse(inputDir, CharStreams.fromString(inputString)); } } + /** + * Parse an IDL file from a string, using the given directory for imports. + */ + public IdlFile parse(URI directory, CharSequence source) throws IOException { + return parse(directory, CharStreams.fromString(source.toString())); + } + /** * Parse an IDL file from a stream. This method cannot handle imports. */ @@ -219,8 +231,14 @@ private IdlFile parse(URI inputDir, CharStream charStream) { parser.setTrace(false); parser.setBuildParseTree(false); - // Trigger parsing. - parser.idlFile(); + try { + // Trigger parsing. + parser.idlFile(); + } catch (SchemaParseException e) { + throw e; + } catch (RuntimeException e) { + throw new SchemaParseException(e); + } return parseListener.getIdlFile(); } @@ -440,10 +458,11 @@ public void exitImportStatement(ImportStatementContext importContext) { break; case IdlParser.Schema: try (InputStream stream = importLocation.toURL().openStream()) { - Schema.Parser parser = new Schema.Parser(); - parser.addTypes(getTypes().values()); // inherit names - parser.parse(stream); - setTypes(parser.getTypes()); // update names + JsonSchemaParser parser = new JsonSchemaParser(); + Collection types = new ArrayList<>(names.values()); + parser.parse(types, importLocation.resolve("."), UtfTextUtils.readAllBytes(stream, null)); + // Ensure we're only changing (adding to) the known types when a parser succeeds + types.forEach(IdlReader.this::addSchema); } break; } diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java new file mode 100644 index 00000000000..c6de45bf077 --- /dev/null +++ b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.idl; + +import org.apache.avro.FormattedSchemaParser; +import org.apache.avro.Schema; +import org.apache.avro.SchemaParseException; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; +import java.util.regex.Pattern; + +public class IdlSchemaParser implements FormattedSchemaParser { + + @Override + public Schema parse(Collection existingSchemata, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + boolean valid = Pattern.compile("^\\A*!" + // Initial whitespace + "(?:/\\*(?:[^*]|\\*[^/])*!\\*/\\s*!|//(!=\\R)*!\\R\\s*!)*!" 
+ // Comments + "(?:namespace|schema|protocol|record|enum|fixed|import)\\s", // First keyword + Pattern.UNICODE_CHARACTER_CLASS | Pattern.MULTILINE).matcher(formattedSchema).find(); + if (valid) { + IdlReader idlReader = new IdlReader(); + idlReader.addTypes(existingSchemata); + IdlFile idlFile = idlReader.parse(baseUri, formattedSchema); + Schema mainSchema = idlFile.getMainSchema(); + if (mainSchema != null) { + return mainSchema; + } + if (!idlFile.getNamedSchemas().isEmpty()) { + return idlFile.getNamedSchemas().values().iterator().next(); + } + } + return null; + } +} diff --git a/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser b/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser new file mode 100644 index 00000000000..acb4986e419 --- /dev/null +++ b/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.apache.avro.idl.IdlSchemaParser