DRILL-8450: Add Data Type Inference to XML Format Plugin (#2819)
cgivre authored Aug 22, 2023
1 parent 5c94c27 commit ee1cfeb
Showing 10 changed files with 196 additions and 23 deletions.
42 changes: 42 additions & 0 deletions common/src/main/java/org/apache/drill/common/Typifier.java
@@ -18,6 +18,9 @@

package org.apache.drill.common;

import org.apache.commons.lang3.StringUtils;
import org.apache.drill.common.types.TypeProtos.MinorType;

import java.nio.CharBuffer;

import java.time.LocalDate;
@@ -45,6 +48,11 @@ public class Typifier {
DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss.SS", defaultLocale),
DateTimeFormatter.ofPattern("MM/dd/yyyy hh:mm:ss a", defaultLocale),
DateTimeFormatter.ofPattern("M/d/yy H:mm", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSVV", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssVV", defaultLocale),
DateTimeFormatter.ofPattern("dd/MM/yyyy HH:mm:ss", defaultLocale)));

private static final HashSet<DateTimeFormatter> dateFormats = new HashSet<>(
@@ -88,6 +96,40 @@ public class Typifier {
// If a String contains any of these, try to evaluate it as an equation
private static final char[] MathCharacters = new char[]{'+', '-', '/', '*', '='};

/**
* This function infers the Drill data type of unknown data.
* @param data The input text of unknown data type.
* @return The inferred Drill {@link MinorType}.
*/
public static MinorType typifyToDrill(String data) {
Entry<Class, String> result = Typifier.typify(data);
String dataType = result.getKey().getSimpleName();

// If the string is empty, return VARCHAR
if (StringUtils.isEmpty(data)) {
return MinorType.VARCHAR;
} else if (dataType.equalsIgnoreCase("Float")) {
return MinorType.FLOAT4;
} else if (dataType.equalsIgnoreCase("Double")) {
return MinorType.FLOAT8;
} else if (dataType.equalsIgnoreCase("Integer")) {
return MinorType.INT;
} else if (dataType.equalsIgnoreCase("Boolean")) {
return MinorType.BIT;
} else if (dataType.equalsIgnoreCase("Long")) {
return MinorType.BIGINT;
} else if (dataType.equalsIgnoreCase("LocalDateTime")) {
return MinorType.TIMESTAMP;
} else if (dataType.equalsIgnoreCase("LocalDate")) {
return MinorType.DATE;
} else if (dataType.equalsIgnoreCase("LocalTime")) {
return MinorType.TIME;
} else {
return MinorType.VARCHAR;
}
}


// default is:
// > don't interpret "0" and "1" as true and false
// > restrict interpretation to common types
5 changes: 4 additions & 1 deletion contrib/format-xml/README.md
@@ -15,12 +15,15 @@ The default configuration is shown below:
"extensions": [
"xml"
],
"allTextMode": true,
"dataLevel": 2
}
```

## Data Types
All fields are read as strings. Nested fields are read as maps. Future functionality could include support for lists.
The XML reader has an `allTextMode` option which, when set to `true`, reads all data fields as strings.
When set to `false`, Drill will attempt to infer data types.
Nested fields are read as maps. Future functionality could include support for lists.
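
For example, to have Drill infer column types, set `allTextMode` to `false` in the format configuration. This is a minimal sketch mirroring the default configuration shown above; the remaining options keep their defaults:

```json
"xml": {
  "extensions": [
    "xml"
  ],
  "allTextMode": false,
  "dataLevel": 2
}
```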

## Provided Schema
The XML Format Reader supports provided inline schemas. An example query might be:
XMLBatchReader.java
@@ -37,27 +37,30 @@
public class XMLBatchReader implements ManagedReader {

private static final Logger logger = LoggerFactory.getLogger(XMLBatchReader.class);

private final FileDescrip file;
private final RowSetLoader rootRowWriter;
private final CustomErrorContext errorContext;
private final XMLReaderConfig readerConfig;

private XMLReader reader;
private final int dataLevel;

static class XMLReaderConfig {
final XMLFormatPlugin plugin;
final int dataLevel;
final boolean allTextMode;

XMLReaderConfig(XMLFormatPlugin plugin) {
this.plugin = plugin;
dataLevel = plugin.getConfig().dataLevel;
allTextMode = plugin.getConfig().allTextMode();
}
}

public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {
errorContext = negotiator.parentErrorContext();
dataLevel = readerConfig.dataLevel;
this.readerConfig = readerConfig;
file = negotiator.file();

// Add schema if provided
@@ -68,7 +71,6 @@ public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {

ResultSetLoader loader = negotiator.build();
rootRowWriter = loader.writer();

openFile();
}

@@ -85,7 +87,7 @@ public void close() {
private void openFile() {
try {
InputStream fsStream = file.fileSystem().openPossiblyCompressedStream(file.split().getPath());
reader = new XMLReader(fsStream, dataLevel);
reader = new XMLReader(fsStream, dataLevel, readerConfig.allTextMode);
reader.open(rootRowWriter, errorContext);
} catch (Exception e) {
throw UserException
XMLFormatConfig.java
Expand Up @@ -36,20 +36,33 @@ public class XMLFormatConfig implements FormatPluginConfig {
public final List<String> extensions;
public final int dataLevel;

@JsonProperty
public final boolean allTextMode;

public XMLFormatConfig(@JsonProperty("extensions") List<String> extensions,
@JsonProperty("dataLevel") int dataLevel) {
@JsonProperty("dataLevel") int dataLevel,
@JsonProperty("allTextMode") Boolean allTextMode
) {
this.extensions = extensions == null ? Collections.singletonList("xml") : ImmutableList.copyOf(extensions);
this.dataLevel = Math.max(dataLevel, 1);

// Default to true
this.allTextMode = allTextMode == null || allTextMode;
}

@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public List<String> getExtensions() {
return extensions;
}

@JsonProperty("allTextMode")
public boolean allTextMode() {
return allTextMode;
}

@Override
public int hashCode() {
return Objects.hash(extensions, dataLevel);
return Objects.hash(extensions, dataLevel, allTextMode);
}

public XMLBatchReader.XMLReaderConfig getReaderConfig(XMLFormatPlugin plugin) {
@@ -66,14 +79,16 @@ public boolean equals(Object obj) {
}
XMLFormatConfig other = (XMLFormatConfig) obj;
return Objects.equals(extensions, other.extensions)
&& Objects.equals(dataLevel, other.dataLevel);
&& Objects.equals(dataLevel, other.dataLevel)
&& Objects.equals(allTextMode, other.allTextMode);
}

@Override
public String toString() {
return new PlanStringBuilder(this)
.field("extensions", extensions)
.field("dataLevel", dataLevel)
.field("extensions", extensions)
.field("dataLevel", dataLevel)
.field("allTextMode", allTextMode)
.toString();
}
}
XMLReader.java
@@ -19,6 +19,7 @@
package org.apache.drill.exec.store.xml;

import org.apache.drill.common.AutoCloseables;
import org.apache.drill.common.Typifier;
import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.types.TypeProtos;
@@ -64,7 +65,7 @@ public class XMLReader implements Closeable {
private final Stack<TupleWriter> rowWriterStack;
private final int dataLevel;
private final Map<String, XMLMap> nestedMapCollection;

private final boolean allTextmode;
private TupleWriter attributeWriter;
private CustomErrorContext errorContext;
private RowSetLoader rootRowWriter;
@@ -96,7 +97,7 @@ private enum xmlState {
ROW_ENDED
}

public XMLReader(InputStream fsStream, int dataLevel) throws XMLStreamException {
public XMLReader(InputStream fsStream, int dataLevel, boolean allTextMode) throws XMLStreamException {
this.fsStream = fsStream;
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
reader = inputFactory.createXMLEventReader(fsStream);
Expand All @@ -105,6 +106,7 @@ public XMLReader(InputStream fsStream, int dataLevel) throws XMLStreamException
nestedMapCollection = new HashMap<>();
this.dataLevel = dataLevel;
isSelfClosingEvent = false;
this.allTextmode = allTextMode;
}

public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext ) {
@@ -431,8 +433,12 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
// Find the TupleWriter object
int index = writer.tupleSchema().index(fieldName);
if (index == -1) {
ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
index = writer.addColumn(colSchema);
if (allTextmode) {
ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
index = writer.addColumn(colSchema);
} else {
index = addNewScalarColumn(fieldName, fieldValue, writer);
}
}
ScalarWriter colWriter = writer.scalar(index);
ColumnMetadata columnMetadata = writer.tupleSchema().metadata(index);
@@ -461,7 +467,12 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
dateFormat = columnMetadata.property("drill.format");
LocalDate localDate;
if (Strings.isNullOrEmpty(dateFormat)) {
localDate = LocalDate.parse(fieldValue);
// Use typifier if all text mode is disabled.
if (!allTextmode) {
localDate = Typifier.stringAsDate(fieldValue);
} else {
localDate = LocalDate.parse(fieldValue);
}
} else {
localDate = LocalDate.parse(fieldValue, DateTimeFormatter.ofPattern(dateFormat));
}
@@ -503,6 +514,26 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
}
}

/**
* Adds a new scalar column to the schema. If the data type is unknown, it will default to
* VARCHAR.
* @param fieldName The field name.
* @param fieldValue The field value.
* @param writer A {@link TupleWriter} for the current Drill schema.
* @return The index of the newly added column.
*/
private int addNewScalarColumn(String fieldName, String fieldValue, TupleWriter writer) {
MinorType dataType;
if (Strings.isNullOrEmpty(fieldValue)) {
dataType = MinorType.VARCHAR;
} else {
dataType = Typifier.typifyToDrill(fieldValue);
}

ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, dataType, DataMode.OPTIONAL);
return writer.addColumn(colSchema);
}

/**
* Writes an attribute. If the field does not have a corresponding ScalarWriter, this method will
* create one.
TestXMLReader.java
@@ -49,7 +49,7 @@ public class TestXMLReader extends ClusterTest {
public static void setup() throws Exception {
ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));

XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2);
XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true);
cluster.defineFormat("cp", "xml", formatConfig);
cluster.defineFormat("dfs", "xml", formatConfig);

@@ -86,6 +86,43 @@ public void testWildcard() throws Exception {
new RowSetComparison(expected).verifyAndClearAll(results);
}

@Test
public void testAllTextMode() throws Exception {
String sql = "SELECT attributes, int_field, bigint_field, float_field, double_field, " +
    "boolean_field, date_field, time_field, timestamp_field, string_field " +
    "FROM table(cp.`xml/simple_with_datatypes.xml` (type => 'xml', allTextMode => 'false'))";
RowSet results = client.queryBuilder().sql(sql).rowSet();
assertEquals(2, results.rowCount());

TupleMetadata expectedSchema = new SchemaBuilder()
.add("attributes", MinorType.MAP)
.addNullable("int_field", MinorType.FLOAT8)
.addNullable("bigint_field", MinorType.FLOAT8)
.addNullable("float_field", MinorType.FLOAT8)
.addNullable("double_field", MinorType.FLOAT8)
.addNullable("boolean_field", MinorType.BIT)
.addNullable("date_field", MinorType.DATE)
.addNullable("time_field", MinorType.VARCHAR)
.addNullable("timestamp_field", MinorType.TIMESTAMP)
.addNullable("string_field", MinorType.VARCHAR)
.buildSchema();

//DateUtility.parseLocalDateTime

RowSet expected = client.rowSetBuilder(expectedSchema)
.addRow(mapArray(), 1.0, 1000.0, 1.3, 3.3, true, LocalDate.parse("2022-01-01"),
    "12:04:34", Instant.parse("2022-01-06T12:30:30Z"), "string")
.addRow(mapArray(), 2.0, 2000.0, 2.3, 4.3, false, LocalDate.parse("2022-02-01"),
    "13:04:34", Instant.parse("2022-03-06T12:30:30Z"), null)
.build();

new RowSetComparison(expected).verifyAndClearAll(results);
}

@Test
public void testSimpleProvidedSchema() throws Exception {
String sql = "SELECT * FROM table(cp.`xml/simple_with_datatypes.xml` (type => 'xml', schema " +
@@ -105,13 +142,17 @@ public void testSimpleProvidedSchema() throws Exception {
.addNullable("time_field", MinorType.TIME)
.addNullable("timestamp_field", MinorType.TIMESTAMP)
.addNullable("string_field", MinorType.VARCHAR)
.addNullable("date2_field", MinorType.DATE)
.addNullable("date2_field", MinorType.DATE)
.add("attributes", MinorType.MAP)
.buildSchema();

RowSet expected = client.rowSetBuilder(expectedSchema)
.addRow(1, 1000L, 1.2999999523162842, 3.3, true, LocalDate.parse("2022-01-01"), LocalTime.parse("12:04:34"), Instant.parse("2022-01-06T12:30:30Z"), "string", LocalDate.parse("2022-03-02"), mapArray())
.addRow(2, 2000L, 2.299999952316284, 4.3, false, LocalDate.parse("2022-02-01"), LocalTime.parse("13:04:34"), Instant.parse("2022-03-06T12:30:30Z"), null, LocalDate.parse("2022-03-01"), mapArray())
.addRow(1, 1000L, 1.2999999523162842, 3.3, true, LocalDate.parse("2022-01-01"),
LocalTime.parse("12:04:34"), Instant.parse("2022-01-06T12:30:30Z"), "string",
LocalDate.parse("2022-03-02"), mapArray())
.addRow(2, 2000L, 2.299999952316284, 4.3, false, LocalDate.parse("2022-02-01"),
LocalTime.parse("13:04:34"), Instant.parse("2022-03-06T12:30:30Z"), null,
LocalDate.parse("2022-03-01"), mapArray())
.build();

new RowSetComparison(expected).verifyAndClearAll(results);
6 changes: 6 additions & 0 deletions contrib/storage-http/XML_Options.md
@@ -5,6 +5,11 @@ Drill has several XML configuration options to allow you to configure how Drill
XML data often contains a considerable amount of nesting which is not necessarily useful for data analysis. This parameter allows you to set the nesting level
where the data actually starts. The levels start at `1`.

## AllTextMode
Drill's XML reader can infer data types. Similar to the JSON reader, there is an option called
`allTextMode` which can be set to `true` to disable data type inference. This is useful if your
data has an inconsistent schema.
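
For example, type inference can be enabled for an endpoint by setting `allTextMode` to `false` in
its `xmlOptions` (a minimal sketch; see the fuller `xmlOptions` example under Schema Provisioning below):

```json
"xmlOptions": {
  "dataLevel": 2,
  "allTextMode": false
}
```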

## Schema Provisioning
One of the challenges of querying APIs is inconsistent data. Drill allows you to provide a schema for individual endpoints. You can do this in one of three ways:

@@ -26,6 +31,7 @@ Or,
```json
"xmlOptions": {
"dataLevel": 2,
"allTextMode": true,
"schema": {
"type": "tuple_schema",
"columns": [
