AVRO-3666: Separate parsing from Schema class

This allows using pluggable parser implementations, allowing multiple formats to be parsed with the same code.
apache · Sep 21, 2023 · e5179e9 · e5179e9
1 parent 73752a4
commit e5179e9
Show file tree

Hide file tree

Showing 12 changed files with 1,073 additions and 26 deletions.
diff --git a/doc/content/en/docs/++version++/Getting started (Java)/_index.md b/doc/content/en/docs/++version++/Getting started (Java)/_index.md
@@ -77,7 +77,7 @@ You may also build the required Avro jars from source. Building Avro is beyond t
 
 ## Defining a schema
 
-Avro schemas are defined using JSON. Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc:
+Avro schemas are defined using JSON or IDL (the latter requires an extra dependency). Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc:
 
 ```json
 {"namespace": "example.avro",
@@ -209,10 +209,10 @@ Data in Avro is always stored with its corresponding schema, meaning we can alwa
 Let's go over the same example as in the previous section, but without using code generation: we'll create some users, serialize them to a data file on disk, and then read back the file and deserialize the users objects.
 
 ### Creating users
-First, we use a Parser to read our schema definition and create a Schema object.
+First, we use a SchemaParser to read our schema definition and create a Schema object.
 
 ```java
-Schema schema = new Schema.Parser().parse(new File("user.avsc"));
+Schema schema = new SchemaParser().parse(new File("user.avsc"));
 ```
 
 Using this schema, let's create some users.

diff --git a/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.avro;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+
+/**
+ * Schema parser for a specific schema format.
+ *
+ * <p>
+ * The {@link SchemaParser} class uses this interface, supporting text based schema sources.
+ * </p>
+ *
+ * <h2>Note to implementers:</h2>
+ *
+ * <p>
+ * Implementations are located using a {@link java.util.ServiceLoader}. See that class for details.
+ * </p>
+ *
+ * <p>
+ * You can expect that schemas being read may be invalid, so you are encouraged to return {@code null} upon parsing
+ * failure where the input clearly doesn't make sense (e.g., reading "/**" when expecting JSON). If the input is likely
+ * in the correct format, but invalid, throw a {@link SchemaParseException} instead.
+ * </p>
+ *
+ * <p>
+ * Note that throwing anything other than a {@code SchemaParseException} will abort the parsing process, so reserve
+ * other exception types for rethrowing exceptions.
+ * </p>
+ *
+ * @see java.util.ServiceLoader
+ */
+public interface FormattedSchemaParser {
+  /**
+   * Parse a schema from a text based source. Can use the base location of the schema (e.g., the directory where the
+   * schema file lives) if available.
+   *
+   * <p>
+   * Implementations should add all named schemata they parse to the {@code types} collection.
+   * </p>
+   *
+   * @param types           a mutable collection of known types; parsed named schemata will be added to it
+   * @param baseUri         the base location of the schema, or {@code null} if not known
+   * @param formattedSchema the schema as text
+   * @return the parsed schema, or {@code null} if the input is not in a format this parser supports
+   * @throws IOException          when the schema cannot be read
+   * @throws SchemaParseException when the schema is likely in the correct format, but cannot be parsed
+   */
+  Schema parse(Collection<Schema> types, URI baseUri, CharSequence formattedSchema)
+      throws IOException, SchemaParseException;
+}
diff --git a/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.avro;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Schema parser for JSON formatted schemata. This initial implementation simply
+ * delegates to the {@link Schema.Parser} class, though the JSON parsing should
+ * be refactored out of there.
+ *
+ * <p>
+ * Note: this class is intentionally not available via the Java
+ * {@link java.util.ServiceLoader}, as its use is hardcoded as fallback when no
+ * service exists. This enables users to reliably override the standard JSON
+ * parser as well.
+ * </p>
+ */
+public class JsonSchemaParser implements FormattedSchemaParser {
+  /**
+   * <p>
+   * Parse a schema in the internal (JSON) format; names and defaults are not validated.
+   * </p>
+   *
+   * <p>
+   * Using this method is only safe if used to parse a write schema (i.e., the
+   * schema with which the Avro data being read was written). Other usages, for
+   * example by generated Avro code, can cause interoperability problems.
+   * </p>
+   *
+   * <p>
+   * Use with care and sufficient testing!
+   * </p>
+   *
+   * @param fragments strings that are concatenated, in order, to form the schema
+   *                  (some schemata exceed the compiler limits)
+   * @return the parsed schema
+   */
+  public static Schema parseInternal(String... fragments) {
+    StringBuilder buffer = new StringBuilder();
+    for (String fragment : fragments) {
+      buffer.append(fragment);
+    }
+    return new JsonSchemaParser().parse(new ArrayList<>(), buffer, true); // true: skip validation
+  }
+
+  @Override
+  public Schema parse(Collection<Schema> schemas, URI baseUri, CharSequence formattedSchema)
+      throws IOException, SchemaParseException {
+    return parse(schemas, formattedSchema, false); // validating parse; baseUri is unused here
+  }
+
+  private Schema parse(Collection<Schema> schemas, CharSequence formattedSchema, boolean skipValidation)
+      throws SchemaParseException {
+    // TODO: refactor JSON parsing out of the Schema class
+    Schema.Parser parser;
+    if (skipValidation) {
+      parser = new Schema.Parser(Schema.NameValidator.NO_VALIDATION);
+      parser.setValidateDefaults(false);
+    } else {
+      parser = new Schema.Parser();
+    }
+    if (schemas != null) { // hand the already known types to the delegate parser
+      parser.addTypes(schemas);
+    }
+    Schema schema = parser.parse(formattedSchema.toString());
+    if (schemas != null) { // NOTE(review): getTypes() presumably includes the types added above - confirm
+      schemas.addAll(parser.getTypes().values());
+    }
+    return schema;
+  }
+}