From f3bcd651dcb0a3fcd5a94d2bef667df8a4a1107b Mon Sep 17 00:00:00 2001 From: Charles Givre Date: Mon, 18 Sep 2023 23:30:38 -0400 Subject: [PATCH] WIP --- contrib/format-xml/README.md | 4 ++++ .../drill/exec/store/xml/XMLBatchReader.java | 8 ++++++- .../drill/exec/store/xml/XMLFormatConfig.java | 22 ++++++++++++++----- .../drill/exec/store/xml/XMLReader.java | 2 +- .../drill/exec/store/xml/TestXMLReader.java | 2 +- 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/contrib/format-xml/README.md b/contrib/format-xml/README.md index 7bca9c79071..16640c6bd98 100644 --- a/contrib/format-xml/README.md +++ b/contrib/format-xml/README.md @@ -6,6 +6,9 @@ Aside from the file extension, there is one configuration option: * `dataLevel`: XML data often contains a considerable amount of nesting which is not necessarily useful for data analysis. This parameter allows you to set the nesting level where the data actually starts. The levels start at `1`. +* `allTextMode`: When set to `true`, Drill will not attempt to infer data types. Defaults to `true`. +* `useXSD`: When set to `true`, if the XML file has an associated XSD schema file, Drill will + download that file and use that for the schema. Defaults to `false`. 
The default configuration is shown below: @@ -16,6 +19,7 @@ The default configuration is shown below: "xml" ], "allTextMode": true, + "useXSD": false, "dataLevel": 2 } ``` diff --git a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java index 6f46f4be7d7..b0642e03953 100644 --- a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java +++ b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLBatchReader.java @@ -49,11 +49,13 @@ static class XMLReaderConfig { final XMLFormatPlugin plugin; final int dataLevel; final boolean allTextMode; + final boolean useXSD; XMLReaderConfig(XMLFormatPlugin plugin) { this.plugin = plugin; dataLevel = plugin.getConfig().dataLevel; allTextMode = plugin.getConfig().allTextMode(); + useXSD = plugin.getConfig().useXSD(); } } @@ -63,7 +65,11 @@ public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchema this.readerConfig = readerConfig; file = negotiator.file(); - // Add schema if provided + // We need to set an order of precedence for schemata. + // The order implemented here is: + // 1. Provided schema + // 2. Schema from XSD + // 3. 
Inferred schema from data if (negotiator.providedSchema() != null) { TupleMetadata schema = negotiator.providedSchema(); negotiator.tableSchema(schema, false); diff --git a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java index 2946d3d9d2d..fc86a8aa0f8 100644 --- a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java +++ b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLFormatConfig.java @@ -35,19 +35,22 @@ public class XMLFormatConfig implements FormatPluginConfig { public final List extensions; public final int dataLevel; - - @JsonProperty public final boolean allTextMode; + public final boolean useXSD; public XMLFormatConfig(@JsonProperty("extensions") List extensions, @JsonProperty("dataLevel") int dataLevel, - @JsonProperty("allTextMode") Boolean allTextMode + @JsonProperty("allTextMode") Boolean allTextMode, + @JsonProperty("useXSD") Boolean useXSD ) { this.extensions = extensions == null ? 
Collections.singletonList("xml") : ImmutableList.copyOf(extensions); this.dataLevel = Math.max(dataLevel, 1); // Default to true this.allTextMode = allTextMode == null || allTextMode; + + // Default to false + this.useXSD = useXSD != null && useXSD; } @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -56,13 +59,20 @@ public List getExtensions() { } @JsonProperty("allTextMode") + @JsonInclude(JsonInclude.Include.NON_DEFAULT) public boolean allTextMode() { return allTextMode; } + @JsonProperty("useXSD") + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public boolean useXSD() { + return useXSD; + } + @Override public int hashCode() { - return Objects.hash(extensions, dataLevel, allTextMode); + return Objects.hash(extensions, dataLevel, allTextMode, useXSD); } public XMLBatchReader.XMLReaderConfig getReaderConfig(XMLFormatPlugin plugin) { @@ -80,7 +90,8 @@ public boolean equals(Object obj) { XMLFormatConfig other = (XMLFormatConfig) obj; return Objects.equals(extensions, other.extensions) && Objects.equals(dataLevel, other.dataLevel) - && Objects.equals(allTextMode, other.allTextMode); + && Objects.equals(allTextMode, other.allTextMode) + && Objects.equals(useXSD, other.useXSD); } @Override @@ -89,6 +100,7 @@ public String toString() { .field("extensions", extensions) .field("dataLevel", dataLevel) .field("allTextMode", allTextMode) + .field("useXSD", useXSD) .toString(); } } diff --git a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java index 8d1fb59ff76..1221aba6fdf 100644 --- a/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java +++ b/contrib/format-xml/src/main/java/org/apache/drill/exec/store/xml/XMLReader.java @@ -85,7 +85,7 @@ public class XMLReader implements Closeable { /** * This field indicates the various states in which the reader operates. 
The names should be self-explanatory, - * but they are used as the reader iterates over the XML tags to know what to do. + * and they are used as the reader iterates over the XML tags to know what to do. */ private enum xmlState { ROW_STARTED, diff --git a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java index a0d3ac913e8..2b95c77f3d0 100644 --- a/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java +++ b/contrib/format-xml/src/test/java/org/apache/drill/exec/store/xml/TestXMLReader.java @@ -49,7 +49,7 @@ public class TestXMLReader extends ClusterTest { public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true); + XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true, false); cluster.defineFormat("cp", "xml", formatConfig); cluster.defineFormat("dfs", "xml", formatConfig);