From d82a3a16bb9473c8a84d413b8c2f846424b8d3b8 Mon Sep 17 00:00:00 2001
From: Dave Syer
Date: Sat, 3 Aug 2024 11:31:05 +0000
Subject: [PATCH 1/2] Add support for reading Json in object per line

---
 .gitignore                                    |   3 +
 .../tablesaw/io/jsonl/JsonlReadOptions.java   | 197 ++++++++++++++++++
 .../tech/tablesaw/io/jsonl/JsonlReader.java   | 104 ++++++++++
 .../tablesaw/io/jsonl/JsonlReaderTest.java    |  60 ++++++
 4 files changed, 364 insertions(+)
 create mode 100644 json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java
 create mode 100644 json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java
 create mode 100644 json/src/test/java/tech/tablesaw/io/jsonl/JsonlReaderTest.java

diff --git a/.gitignore b/.gitignore
index ac03e2a5f..9bd6972f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,6 @@ bin/
 nbproject/private/
 nbactions.xml
 nb-configuration.xml
+
+# VSCode
+.vscode/
diff --git a/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java
new file mode 100644
index 000000000..ed44b1d46
--- /dev/null
+++ b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReadOptions.java
@@ -0,0 +1,197 @@
+package tech.tablesaw.io.jsonl;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URL;
+import java.time.format.DateTimeFormatter;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+
+import tech.tablesaw.api.ColumnType;
+import tech.tablesaw.io.ReadOptions;
+import tech.tablesaw.io.Source;
+
+/**
+ * Options for reading a table from JSON Lines input (one JSON object per line).
+ *
+ * <p>Instances are created through one of the static {@code builder} factory methods.
+ */
+public class JsonlReadOptions extends ReadOptions {
+
+  private final String path;
+
+  protected JsonlReadOptions(Builder builder) {
+    super(builder);
+    this.path = builder.path;
+  }
+
+  public static Builder builder(Source source) {
+    return new Builder(source);
+  }
+
+  public static Builder builder(File file) {
+    return new Builder(file).tableName(file.getName());
+  }
+
+  public static Builder builder(String fileName) {
+    return new Builder(new File(fileName));
+  }
+
+  public static Builder builder(URL url) throws IOException {
+    return new Builder(url);
+  }
+
+  public static Builder builderFromFile(String fileName) {
+    return new Builder(new File(fileName));
+  }
+
+  public static Builder builderFromString(String contents) {
+    return new Builder(new StringReader(contents));
+  }
+
+  public static Builder builderFromUrl(String url) throws IOException {
+    return new Builder(new URL(url));
+  }
+
+  public static Builder builder(InputStream stream) {
+    return new Builder(stream);
+  }
+
+  public static Builder builder(Reader reader) {
+    return new Builder(reader);
+  }
+
+  /**
+   * Returns the JSON Pointer path set on the builder, or {@code null} if none was set.
+   *
+   * <p>NOTE(review): {@code JsonlReader} in this patch never calls this method, so the value
+   * currently has no effect on reading.
+   */
+  public String path() {
+    return path;
+  }
+
+  /** Builder for {@link JsonlReadOptions}. */
+  public static class Builder extends ReadOptions.Builder {
+
+    private String path;
+
+    protected Builder(Source source) {
+      super(source);
+    }
+
+    protected Builder(URL url) throws IOException {
+      super(url);
+    }
+
+    public Builder(File file) {
+      super(file);
+    }
+
+    protected Builder(Reader reader) {
+      super(reader);
+    }
+
+    protected Builder(InputStream stream) {
+      super(stream);
+    }
+
+    @Override
+    public JsonlReadOptions build() {
+      return new JsonlReadOptions(this);
+    }
+
+    // Override super-class setters to return an instance of this class
+
+    @Override
+    public Builder header(boolean header) {
+      super.header(header);
+      return this;
+    }
+
+    @Override
+    public Builder tableName(String tableName) {
+      super.tableName(tableName);
+      return this;
+    }
+
+    @Override
+    public Builder sample(boolean sample) {
+      super.sample(sample);
+      return this;
+    }
+
+    @Override
+    public Builder dateFormat(DateTimeFormatter dateFormat) {
+      super.dateFormat(dateFormat);
+      return this;
+    }
+
+    @Override
+    public Builder timeFormat(DateTimeFormatter timeFormat) {
+      super.timeFormat(timeFormat);
+      return this;
+    }
+
+    @Override
+    public Builder dateTimeFormat(DateTimeFormatter dateTimeFormat) {
+      super.dateTimeFormat(dateTimeFormat);
+      return this;
+    }
+
+    @Override
+    public Builder locale(Locale locale) {
+      super.locale(locale);
+      return this;
+    }
+
+    @Override
+    public Builder missingValueIndicator(String... missingValueIndicators) {
+      super.missingValueIndicator(missingValueIndicators);
+      return this;
+    }
+
+    @Override
+    public Builder minimizeColumnSizes() {
+      super.minimizeColumnSizes();
+      return this;
+    }
+
+    /**
+     * @param path the JSON Pointer path used to select a sub-tree in the main document
+     */
+    public Builder path(String path) {
+      this.path = path;
+      return this;
+    }
+
+    @Override
+    public Builder columnTypes(ColumnType[] columnTypes) {
+      super.columnTypes(columnTypes);
+      return this;
+    }
+
+    @Override
+    public Builder columnTypes(Function<String, ColumnType> columnTypeFunction) {
+      super.columnTypes(columnTypeFunction);
+      return this;
+    }
+
+    @Override
+    public Builder columnTypesPartial(Function<String, Optional<ColumnType>> columnTypeFunction) {
+      super.columnTypesPartial(columnTypeFunction);
+      return this;
+    }
+
+    @Override
+    public Builder columnTypesPartial(Map<String, ColumnType> columnTypeByName) {
+      super.columnTypesPartial(columnTypeByName);
+      return this;
+    }
+  }
+}
diff --git a/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java
new file mode 100644
index 000000000..eadc89323
--- /dev/null
+++ b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlReader.java
@@ -0,0 +1,104 @@
+package tech.tablesaw.io.jsonl;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectReader;
+import com.github.wnameless.json.flattener.JsonFlattener;
+
+import tech.tablesaw.api.Table;
+import tech.tablesaw.io.DataReader;
+import tech.tablesaw.io.ReadOptions;
+import tech.tablesaw.io.ReaderRegistry;
+import tech.tablesaw.io.RuntimeIOException;
+import tech.tablesaw.io.Source;
+import tech.tablesaw.io.TableBuildingUtils;
+
+/**
+ * Reads a table from a stream of JSON values, one row per value, flattening nested objects so
+ * that a nested field {@code c} of {@code b} becomes the column {@code b.c}.
+ */
+public class JsonlReader implements DataReader<JsonlReadOptions> {
+
+  private static final JsonlReader INSTANCE = new JsonlReader();
+  private static final ObjectMapper mapper = new ObjectMapper();
+
+  static {
+    register(Table.defaultReaderRegistry);
+  }
+
+  /** Registers this reader for the {@code jsonl} extension, its MIME types and option class. */
+  public static void register(ReaderRegistry registry) {
+    registry.registerExtension("jsonl", INSTANCE);
+    registry.registerMimeType("text/jsonl", INSTANCE);
+    registry.registerMimeType("application/jsonl+json", INSTANCE);
+    registry.registerOptions(JsonlReadOptions.class, INSTANCE);
+  }
+
+  /**
+   * Reads every JSON value from the source in {@code options} and builds a table from them.
+   *
+   * @throws RuntimeIOException if an IOException is raised while reading or closing the source
+   */
+  @Override
+  public Table read(JsonlReadOptions options) {
+    ObjectReader stream = mapper.readerFor(JsonNode.class);
+    // try-with-resources releases the reader and parser even when parsing or table building fails
+    try (Reader reader = options.source().createReader(null);
+        JsonParser parser = stream.createParser(reader)) {
+      Iterator<JsonNode> iter = stream.readValues(parser);
+      return convertObjects(iter, options);
+    } catch (IOException e) {
+      throw new RuntimeIOException(e);
+    }
+  }
+
+  /** Flattens each row, collects the union of field names as columns, and builds the table. */
+  private Table convertObjects(Iterator<JsonNode> iter, ReadOptions options) {
+    List<JsonNode> flattenedRows = new ArrayList<>();
+    // LinkedHashSet keeps the columns in the order in which they are first seen
+    Set<String> colNames = new LinkedHashSet<>();
+    while (iter.hasNext()) {
+      JsonNode flattenedRow = flatten(iter.next());
+      flattenedRows.add(flattenedRow);
+      flattenedRow.fieldNames().forEachRemaining(colNames::add);
+    }
+
+    List<String> columnNames = new ArrayList<>(colNames);
+    List<String[]> dataRows = new ArrayList<>(flattenedRows.size());
+    for (JsonNode row : flattenedRows) {
+      String[] arr = new String[columnNames.size()];
+      for (int i = 0; i < arr.length; i++) {
+        JsonNode value = row.get(columnNames.get(i));
+        // a field absent from this row is passed on as null
+        arr[i] = value == null ? null : value.asText();
+      }
+      dataRows.add(arr);
+    }
+
+    return TableBuildingUtils.build(columnNames, dataRows, options);
+  }
+
+  /** Round-trips one row through {@link JsonFlattener} and parses the flattened JSON again. */
+  private static JsonNode flatten(JsonNode rowObj) {
+    try {
+      return mapper.readTree(JsonFlattener.flatten(mapper.writeValueAsString(rowObj)));
+    } catch (JsonProcessingException e) {
+      throw new RuntimeIOException(e);
+    }
+  }
+
+  @Override
+  public Table read(Source source) {
+    return read(JsonlReadOptions.builder(source).build());
+  }
+}
diff --git a/json/src/test/java/tech/tablesaw/io/jsonl/JsonlReaderTest.java b/json/src/test/java/tech/tablesaw/io/jsonl/JsonlReaderTest.java
new file mode 100644
index 000000000..c4bb5cdb5
--- /dev/null
+++ b/json/src/test/java/tech/tablesaw/io/jsonl/JsonlReaderTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package tech.tablesaw.io.jsonl;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import tech.tablesaw.api.ColumnType;
+import tech.tablesaw.api.IntColumn;
+import tech.tablesaw.api.Table;
+
+public class JsonlReaderTest {
+
+  @Test
+  public void arrayOfNestedObjects() {
+    String json =
+        "{\"a\":1453438800000,\"b\":{\"c\":-2.1448117025014}}\n" //
+            + "{\"a\":1454043600000,\"b\":{\"c\":-2.9763153817574}}\n" //
+            + "{\"a\":1454648400000,\"b\":{\"c\":-2.9545283436391}}";
+    Table table = Table.read().string(json, "jsonl");
+    assertEquals(2, table.columnCount());
+    assertEquals(3, table.rowCount());
+    assertEquals("a", table.column(0).name());
+    assertEquals("b.c", table.column(1).name());
+    assertEquals(ColumnType.LONG, table.typeArray()[0]);
+  }
+
+  @Test
+  public void arrayOfRowsWithIncompleteIndexes() {
+    String json =
+        "{\"A\" : \"123\", \"B\" : \"456\"}\n" //
+            + "{\"B\" : \"789\", \"C\" : \"123\"}";
+
+    Table expected =
+        Table.create(
+            IntColumn.create("A", new int[] {123, Integer.MIN_VALUE}),
+            IntColumn.create("B", new int[] {456, 789}),
+            IntColumn.create("C", new int[] {Integer.MIN_VALUE, 123}));
+    Table actual = Table.read().string(json, "jsonl");
+
+    assertEquals(ColumnType.INTEGER, actual.typeArray()[0]);
+    assertEquals(expected.column("A").asList(), actual.column("A").asList());
+    assertEquals(expected.column("B").asList(), actual.column("B").asList());
+    assertEquals(expected.column("C").asList(), actual.column("C").asList());
+  }
+
+}

From 41ae65edd8f8e339806103e994f556e9d2b9ae87 Mon Sep 17 00:00:00 2001
From: Dave Syer
Date: Sat, 3 Aug 2024 13:12:52 +0100
Subject: [PATCH 2/2] Add support for writing Json object per line

---
 .../tablesaw/io/jsonl/JsonlWriteOptions.java  | 33 +++++++++
 .../tech/tablesaw/io/jsonl/JsonlWriter.java   | 76 +++++++++++++++++++
 .../tablesaw/io/jsonl/JsonlWriterTest.java    | 32 ++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java
 create mode 100644 json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriter.java
 create mode 100644 json/src/test/java/tech/tablesaw/io/jsonl/JsonlWriterTest.java

diff --git a/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java
new file mode 100644
index 000000000..aacf905d0
--- /dev/null
+++ b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriteOptions.java
@@ -0,0 +1,33 @@
+package tech.tablesaw.io.jsonl;
+
+import java.io.Writer;
+import tech.tablesaw.io.Destination;
+import tech.tablesaw.io.WriteOptions;
+
+/** Options for writing a table as JSON Lines. */
+public class JsonlWriteOptions extends WriteOptions {
+
+  private JsonlWriteOptions(Builder builder) {
+    super(builder);
+  }
+
+  public static Builder builder(Writer writer) {
+    return new Builder(new Destination(writer));
+  }
+
+  public static Builder builder(Destination destination) {
+    return new Builder(destination);
+  }
+
+  /** Builder for {@link JsonlWriteOptions}. */
+  public static class Builder extends WriteOptions.Builder {
+
+    protected Builder(Destination destination) {
+      super(destination);
+    }
+
+    public JsonlWriteOptions build() {
+      return new JsonlWriteOptions(this);
+    }
+  }
+}
diff --git a/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriter.java b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriter.java
new file mode 100644
index 000000000..c51485bed
--- /dev/null
+++ b/json/src/main/java/tech/tablesaw/io/jsonl/JsonlWriter.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package tech.tablesaw.io.jsonl;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+
+import tech.tablesaw.api.Table;
+import tech.tablesaw.io.DataWriter;
+import tech.tablesaw.io.Destination;
+import tech.tablesaw.io.RuntimeIOException;
+import tech.tablesaw.io.WriterRegistry;
+
+/** Writes a table as JSON Lines: one JSON object per row. */
+public class JsonlWriter implements DataWriter<JsonlWriteOptions> {
+
+  private static final JsonlWriter INSTANCE = new JsonlWriter();
+  private static final ObjectMapper mapper = new ObjectMapper().registerModule(new JavaTimeModule());
+
+  static {
+    register(Table.defaultWriterRegistry);
+  }
+
+  /** Registers this writer for the {@code jsonl} extension and its option class. */
+  public static void register(WriterRegistry registry) {
+    registry.registerExtension("jsonl", INSTANCE);
+    registry.registerOptions(JsonlWriteOptions.class, INSTANCE);
+  }
+
+  /**
+   * Writes each row of {@code table} as one JSON object per line, keyed by column name. Lines are
+   * separated by {@code \n}; no newline is written after the last row.
+   *
+   * @throws RuntimeIOException if writing to the destination fails
+   */
+  public void write(Table table, JsonlWriteOptions options) {
+    int rowCount = table.rowCount();
+    int columnCount = table.columnCount();
+    try (Writer writer = options.destination().createWriter()) {
+      for (int r = 0; r < rowCount; r++) {
+        if (r > 0) {
+          writer.write("\n");
+        }
+        ObjectNode row = mapper.createObjectNode();
+        for (int c = 0; c < columnCount; c++) {
+          row.set(table.column(c).name(), mapper.convertValue(table.get(r, c), JsonNode.class));
+        }
+        writer.write(mapper.writeValueAsString(row));
+      }
+    } catch (IOException e) {
+      throw new RuntimeIOException(e);
+    }
+  }
+
+  @Override
+  public void write(Table table, Destination dest) {
+    write(table, JsonlWriteOptions.builder(dest).build());
+  }
+}
diff --git a/json/src/test/java/tech/tablesaw/io/jsonl/JsonlWriterTest.java b/json/src/test/java/tech/tablesaw/io/jsonl/JsonlWriterTest.java
new file mode 100644
index 000000000..4c3e69cb9
--- /dev/null
+++ b/json/src/test/java/tech/tablesaw/io/jsonl/JsonlWriterTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package tech.tablesaw.io.jsonl;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import tech.tablesaw.api.Table;
+
+public class JsonlWriterTest {
+
+  @Test
+  public void arrayOfObjects() {
+    String json = "{\"a\":1453438800000,\"b\":-2.144}\n{\"a\":1454043600000,\"b\":-2.976}\n{\"a\":1454648400000,\"b\":-2.954}";
+    Table table = Table.read().string(json, "jsonl");
+    String output = table.write().toString("jsonl");
+    assertEquals(json, output);
+  }
+}