DRILL-8450: Add Data Type Inference to XML Format Plugin (#2819)
cgivre authored Aug 22, 2023
1 parent 5c94c27 commit ee1cfeb
Showing 10 changed files with 196 additions and 23 deletions.
42 changes: 42 additions & 0 deletions common/src/main/java/org/apache/drill/common/Typifier.java
@@ -18,6 +18,9 @@

package org.apache.drill.common;

import org.apache.commons.lang3.StringUtils;
import org.apache.drill.common.types.TypeProtos.MinorType;

import java.nio.CharBuffer;

import java.time.LocalDate;
@@ -45,6 +48,11 @@ public class Typifier {
DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss.SS", defaultLocale),
DateTimeFormatter.ofPattern("MM/dd/yyyy hh:mm:ss a", defaultLocale),
DateTimeFormatter.ofPattern("M/d/yy H:mm", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSVV", defaultLocale),
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssVV", defaultLocale),
DateTimeFormatter.ofPattern("dd/MM/yyyy HH:mm:ss", defaultLocale)));

private static final HashSet<DateTimeFormatter> dateFormats = new HashSet<>(
@@ -88,6 +96,40 @@ public class Typifier {
// If a String contains any of these, try to evaluate it as an equation
private static final char[] MathCharacters = new char[]{'+', '-', '/', '*', '='};

/**
* This function infers the Drill data type of unknown data.
* @param data The input text of unknown data type.
* @return The inferred Drill {@link MinorType}.
*/
public static MinorType typifyToDrill(String data) {
Entry<Class, String> result = Typifier.typify(data);
String dataType = result.getKey().getSimpleName();

// If the string is empty, return VARCHAR
if (StringUtils.isEmpty(data)) {
return MinorType.VARCHAR;
} else if (dataType.equalsIgnoreCase("Float")) {
return MinorType.FLOAT4;
} else if (dataType.equalsIgnoreCase("Double")) {
return MinorType.FLOAT8;
} else if (dataType.equalsIgnoreCase("Integer")) {
return MinorType.INT;
} else if (dataType.equalsIgnoreCase("Boolean")) {
return MinorType.BIT;
} else if (dataType.equalsIgnoreCase("Long")) {
return MinorType.BIGINT;
} else if (dataType.equalsIgnoreCase("LocalDateTime")) {
return MinorType.TIMESTAMP;
} else if (dataType.equalsIgnoreCase("LocalDate")) {
return MinorType.DATE;
} else if (dataType.equalsIgnoreCase("LocalTime")) {
return MinorType.TIME;
} else {
return MinorType.VARCHAR;
}
}


// default is:
// > don't interpret "0" and "1" as true and false
// > restrict interpretation to common types
5 changes: 4 additions & 1 deletion contrib/format-xml/README.md
@@ -15,12 +15,15 @@ The default configuration is shown below:
"extensions": [
"xml"
],
"allTextMode": true,
"dataLevel": 2
}
```

## Data Types
All fields are read as strings. Nested fields are read as maps. Future functionality could include support for lists.
The XML reader has an `allTextMode` option which, when set to `true`, reads all data fields as strings.
When set to `false`, Drill will attempt to infer data types.
Nested fields are read as maps. Future functionality could include support for lists.
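
For example, to have Drill infer column types, set `allTextMode` to `false` in the format configuration. This is a minimal sketch mirroring the default configuration shown above; the remaining options keep their defaults:

```json
"xml": {
  "extensions": [
    "xml"
  ],
  "allTextMode": false,
  "dataLevel": 2
}
```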

## Provided Schema
The XML Format Reader supports provided inline schemas. An example query might be:
XMLBatchReader.java
@@ -37,27 +37,30 @@
public class XMLBatchReader implements ManagedReader {

private static final Logger logger = LoggerFactory.getLogger(XMLBatchReader.class);

private final FileDescrip file;
private final RowSetLoader rootRowWriter;
private final CustomErrorContext errorContext;
private final XMLReaderConfig readerConfig;

private XMLReader reader;
private final int dataLevel;

static class XMLReaderConfig {
final XMLFormatPlugin plugin;
final int dataLevel;
final boolean allTextMode;

XMLReaderConfig(XMLFormatPlugin plugin) {
this.plugin = plugin;
dataLevel = plugin.getConfig().dataLevel;
allTextMode = plugin.getConfig().allTextMode();
}
}

public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {
errorContext = negotiator.parentErrorContext();
dataLevel = readerConfig.dataLevel;
this.readerConfig = readerConfig;
file = negotiator.file();

// Add schema if provided
@@ -68,7 +71,6 @@ public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) {

ResultSetLoader loader = negotiator.build();
rootRowWriter = loader.writer();

openFile();
}

@@ -85,7 +87,7 @@ public void close() {
private void openFile() {
try {
InputStream fsStream = file.fileSystem().openPossiblyCompressedStream(file.split().getPath());
reader = new XMLReader(fsStream, dataLevel);
reader = new XMLReader(fsStream, dataLevel, readerConfig.allTextMode);
reader.open(rootRowWriter, errorContext);
} catch (Exception e) {
throw UserException
XMLFormatConfig.java
Expand Up @@ -36,20 +36,33 @@ public class XMLFormatConfig implements FormatPluginConfig {
public final List<String> extensions;
public final int dataLevel;

@JsonProperty
public final boolean allTextMode;

public XMLFormatConfig(@JsonProperty("extensions") List<String> extensions,
@JsonProperty("dataLevel") int dataLevel) {
@JsonProperty("dataLevel") int dataLevel,
@JsonProperty("allTextMode") Boolean allTextMode
) {
this.extensions = extensions == null ? Collections.singletonList("xml") : ImmutableList.copyOf(extensions);
this.dataLevel = Math.max(dataLevel, 1);

// Default to true
this.allTextMode = allTextMode == null || allTextMode;
}

@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public List<String> getExtensions() {
return extensions;
}

@JsonProperty("allTextMode")
public boolean allTextMode() {
return allTextMode;
}

@Override
public int hashCode() {
return Objects.hash(extensions, dataLevel);
return Objects.hash(extensions, dataLevel, allTextMode);
}

public XMLBatchReader.XMLReaderConfig getReaderConfig(XMLFormatPlugin plugin) {
@@ -66,14 +79,16 @@ public boolean equals(Object obj) {
}
XMLFormatConfig other = (XMLFormatConfig) obj;
return Objects.equals(extensions, other.extensions)
&& Objects.equals(dataLevel, other.dataLevel);
&& Objects.equals(dataLevel, other.dataLevel)
&& Objects.equals(allTextMode, other.allTextMode);
}

@Override
public String toString() {
return new PlanStringBuilder(this)
.field("extensions", extensions)
.field("dataLevel", dataLevel)
.field("extensions", extensions)
.field("dataLevel", dataLevel)
.field("allTextMode", allTextMode)
.toString();
}
}
XMLReader.java
@@ -19,6 +19,7 @@
package org.apache.drill.exec.store.xml;

import org.apache.drill.common.AutoCloseables;
import org.apache.drill.common.Typifier;
import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.types.TypeProtos;
@@ -64,7 +65,7 @@ public class XMLReader implements Closeable {
private final Stack<TupleWriter> rowWriterStack;
private final int dataLevel;
private final Map<String, XMLMap> nestedMapCollection;

private final boolean allTextmode;
private TupleWriter attributeWriter;
private CustomErrorContext errorContext;
private RowSetLoader rootRowWriter;
@@ -96,7 +97,7 @@ private enum xmlState {
ROW_ENDED
}

public XMLReader(InputStream fsStream, int dataLevel) throws XMLStreamException {
public XMLReader(InputStream fsStream, int dataLevel, boolean allTextMode) throws XMLStreamException {
this.fsStream = fsStream;
XMLInputFactory inputFactory = XMLInputFactory.newInstance();
reader = inputFactory.createXMLEventReader(fsStream);
Expand All @@ -105,6 +106,7 @@ public XMLReader(InputStream fsStream, int dataLevel) throws XMLStreamException
nestedMapCollection = new HashMap<>();
this.dataLevel = dataLevel;
isSelfClosingEvent = false;
this.allTextmode = allTextMode;
}

public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext ) {
@@ -431,8 +433,12 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
// Find the TupleWriter object
int index = writer.tupleSchema().index(fieldName);
if (index == -1) {
ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
index = writer.addColumn(colSchema);
if (allTextmode) {
ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
index = writer.addColumn(colSchema);
} else {
index = addNewScalarColumn(fieldName, fieldValue, writer);
}
}
ScalarWriter colWriter = writer.scalar(index);
ColumnMetadata columnMetadata = writer.tupleSchema().metadata(index);
@@ -461,7 +467,12 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
dateFormat = columnMetadata.property("drill.format");
LocalDate localDate;
if (Strings.isNullOrEmpty(dateFormat)) {
localDate = LocalDate.parse(fieldValue);
// Use typifier if all text mode is disabled.
if (!allTextmode) {
localDate = Typifier.stringAsDate(fieldValue);
} else {
localDate = LocalDate.parse(fieldValue);
}
} else {
localDate = LocalDate.parse(fieldValue, DateTimeFormatter.ofPattern(dateFormat));
}
@@ -503,6 +514,26 @@ private void writeFieldData(String fieldName, String fieldValue, TupleWriter writer) {
}
}

/**
* Adds a new scalar column to the schema. If the data type is unknown, it will default to
* VARCHAR.
* @param fieldName The field name.
* @param fieldValue The field value.
* @param writer A {@link TupleWriter} for the current Drill schema.
* @return The index of the newly added column.
*/
private int addNewScalarColumn(String fieldName, String fieldValue, TupleWriter writer) {
MinorType dataType;
if (Strings.isNullOrEmpty(fieldValue)) {
dataType = MinorType.VARCHAR;
} else {
dataType = Typifier.typifyToDrill(fieldValue);
}

ColumnMetadata colSchema = MetadataUtils.newScalar(fieldName, dataType, DataMode.OPTIONAL);
return writer.addColumn(colSchema);
}

/**
* Writes an attribute. If the field does not have a corresponding ScalarWriter, this method will
* create one.
TestXMLReader.java
@@ -49,7 +49,7 @@ public class TestXMLReader extends ClusterTest {
public static void setup() throws Exception {
ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));

XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2);
XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true);
cluster.defineFormat("cp", "xml", formatConfig);
cluster.defineFormat("dfs", "xml", formatConfig);

@@ -86,6 +86,43 @@ public void testWildcard() throws Exception {
new RowSetComparison(expected).verifyAndClearAll(results);
}

@Test
public void testAllTextMode() throws Exception {
String sql = "SELECT attributes, int_field, bigint_field, float_field, double_field, " +
    "boolean_field, date_field, time_field, timestamp_field, string_field " +
    "FROM table(cp.`xml/simple_with_datatypes.xml` (type => 'xml', allTextMode => 'false'))";
RowSet results = client.queryBuilder().sql(sql).rowSet();
assertEquals(2, results.rowCount());

TupleMetadata expectedSchema = new SchemaBuilder()
.add("attributes", MinorType.MAP)
.addNullable("int_field", MinorType.FLOAT8)
.addNullable("bigint_field", MinorType.FLOAT8)
.addNullable("float_field", MinorType.FLOAT8)
.addNullable("double_field", MinorType.FLOAT8)
.addNullable("boolean_field", MinorType.BIT)
.addNullable("date_field", MinorType.DATE)
.addNullable("time_field", MinorType.VARCHAR)
.addNullable("timestamp_field", MinorType.TIMESTAMP)
.addNullable("string_field", MinorType.VARCHAR)
.buildSchema();

//DateUtility.parseLocalDateTime

RowSet expected = client.rowSetBuilder(expectedSchema)
.addRow(mapArray(), 1.0, 1000.0, 1.3, 3.3, true, LocalDate.parse("2022-01-01"),
    "12:04:34", Instant.parse("2022-01-06T12:30:30Z"), "string")
.addRow(mapArray(), 2.0, 2000.0, 2.3, 4.3, false, LocalDate.parse("2022-02-01"),
    "13:04:34", Instant.parse("2022-03-06T12:30:30Z"), null)
.build();

new RowSetComparison(expected).verifyAndClearAll(results);
}

@Test
public void testSimpleProvidedSchema() throws Exception {
String sql = "SELECT * FROM table(cp.`xml/simple_with_datatypes.xml` (type => 'xml', schema " +
@@ -105,13 +142,17 @@ public void testSimpleProvidedSchema() throws Exception {
.addNullable("time_field", MinorType.TIME)
.addNullable("timestamp_field", MinorType.TIMESTAMP)
.addNullable("string_field", MinorType.VARCHAR)
.addNullable("date2_field", MinorType.DATE)
.addNullable("date2_field", MinorType.DATE)
.add("attributes", MinorType.MAP)
.buildSchema();

RowSet expected = client.rowSetBuilder(expectedSchema)
.addRow(1, 1000L, 1.2999999523162842, 3.3, true, LocalDate.parse("2022-01-01"), LocalTime.parse("12:04:34"), Instant.parse("2022-01-06T12:30:30Z"), "string", LocalDate.parse("2022-03-02"), mapArray())
.addRow(2, 2000L, 2.299999952316284, 4.3, false, LocalDate.parse("2022-02-01"), LocalTime.parse("13:04:34"), Instant.parse("2022-03-06T12:30:30Z"), null, LocalDate.parse("2022-03-01"), mapArray())
.addRow(1, 1000L, 1.2999999523162842, 3.3, true, LocalDate.parse("2022-01-01"),
LocalTime.parse("12:04:34"), Instant.parse("2022-01-06T12:30:30Z"), "string",
LocalDate.parse("2022-03-02"), mapArray())
.addRow(2, 2000L, 2.299999952316284, 4.3, false, LocalDate.parse("2022-02-01"),
LocalTime.parse("13:04:34"), Instant.parse("2022-03-06T12:30:30Z"), null,
LocalDate.parse("2022-03-01"), mapArray())
.build();

new RowSetComparison(expected).verifyAndClearAll(results);
6 changes: 6 additions & 0 deletions contrib/storage-http/XML_Options.md
@@ -5,6 +5,11 @@ Drill has several XML configuration options to allow you to configure how Drill
XML data often contains a considerable amount of nesting which is not necessarily useful for data analysis. This parameter allows you to set the nesting level
where the data actually starts. The levels start at `1`.

## AllTextMode
Drill's XML reader can infer data types. Similar to the JSON reader, there is an option called
`allTextMode` which can be set to `true` to disable data type inference. This is useful if your
data has an inconsistent schema.
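
For example, type inference can be enabled for an endpoint by setting `allTextMode` to `false` in
its `xmlOptions` (a minimal sketch; see the fuller `xmlOptions` example under Schema Provisioning below):

```json
"xmlOptions": {
  "dataLevel": 2,
  "allTextMode": false
}
```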

## Schema Provisioning
One of the challenges of querying APIs is inconsistent data. Drill allows you to provide a schema for individual endpoints. You can do this in one of three ways:

@@ -26,6 +31,7 @@ Or,
```json
"xmlOptions": {
"dataLevel": 2,
"allTextMode": true,
"schema": {
"type": "tuple_schema",
"columns": [
