From 5c92d47b3b1203cb22d6ed2db9e1365331ec5256 Mon Sep 17 00:00:00 2001
From: jimeng
Date: Mon, 7 Oct 2024 16:03:29 +0800
Subject: [PATCH] add test case

Add JsonMultiValueParsingTest and a JMH benchmark that compares
SimdJsonParser2 with Jackson, and route field-name / scalar decoding in
SimdJsonParser2 through the new parseField() helper.

Review fixes folded into this patch:
- Keep the pruning loops in parseMap()/parseList() on
  `parseCols < expectParseCols`. With `<=`, the early return added to
  parseElement() consumes no token, so the loops could spin forever as
  soon as every requested path had been resolved before the end of the
  enclosing object/array (e.g. path "a" on {"a":1,"b":2}).
- Benchmark: both parsers now extract the same six values (metadata,
  created_at, id of statuses[0] and statuses[1]) and return them so JMH
  cannot eliminate the work as dead code.
- Jackson is only needed by the benchmarks, so it is declared as
  jmhImplementation instead of leaking into the main classpath.

---
 build.gradle                                |  6 ++
 .../simdjson/Parse2VsJacksonBenchMark.java  | 64 +++++++++++++
 src/main/java/org/simdjson/BitIndexes.java  |  2 +
 .../java/org/simdjson/SimdJsonParser2.java  | 91 +++++++++----------
 .../simdjson/JsonMultiValueParsingTest.java | 42 +++++++++
 5 files changed, 157 insertions(+), 48 deletions(-)
 create mode 100644 src/jmh/java/org/simdjson/Parse2VsJacksonBenchMark.java
 create mode 100644 src/test/java/org/simdjson/JsonMultiValueParsingTest.java

diff --git a/build.gradle b/build.gradle
index 66f9d78..fc6ccce 100644
--- a/build.gradle
+++ b/build.gradle
@@ -47,6 +47,7 @@ ext {
     junitVersion = '5.10.2'
     jsoniterScalaVersion = '2.28.4'
     lombokVersion = '1.18.34'
+    jacksonVersion = '2.18.0'
 }
 
 dependencies {
@@ -66,6 +67,11 @@ dependencies {
     testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
     testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
     testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
+
+    // Jackson dependency for jmh
+    jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: jacksonVersion
+    jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: jacksonVersion
+    jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-annotations', version: jacksonVersion
 }
 
 tasks.register('downloadTestData') {
diff --git a/src/jmh/java/org/simdjson/Parse2VsJacksonBenchMark.java b/src/jmh/java/org/simdjson/Parse2VsJacksonBenchMark.java
new file mode 100644
index 0000000..49ee8d8
--- /dev/null
+++ b/src/jmh/java/org/simdjson/Parse2VsJacksonBenchMark.java
@@ -0,0 +1,64 @@
+package org.simdjson;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.TimeUnit;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+
+/**
+ * Compares SimdJsonParser2 path extraction with Jackson tree parsing on the same document.
+ * Both benchmarks read metadata, created_at and id of the first two statuses and return
+ * the extracted values so that JMH cannot eliminate the work as dead code.
+ */
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+public class Parse2VsJacksonBenchMark {
+    @Param({"/twitter.json"})
+    String fileName;
+    private byte[] buffer;
+    private final SimdJsonParser2 parser = new SimdJsonParser2(
+            "statuses.0.metadata", "statuses.0.created_at", "statuses.0.id",
+            "statuses.1.metadata", "statuses.1.created_at", "statuses.1.id");
+    private final ObjectMapper mapper = new ObjectMapper();
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        try (InputStream is = Parse2VsJacksonBenchMark.class.getResourceAsStream(fileName)) {
+            if (is == null) {
+                throw new IOException("Benchmark resource not found: " + fileName);
+            }
+            buffer = is.readAllBytes();
+        }
+    }
+
+    @Benchmark
+    public String[] parseBySimdJson() {
+        return parser.parse(buffer, buffer.length);
+    }
+
+    @Benchmark
+    public String[] parseByJackson() throws IOException {
+        ArrayNode statuses = (ArrayNode) mapper.readTree(buffer).path("statuses");
+        String[] result = new String[6];
+        result[0] = statuses.get(0).path("metadata").toString();
+        result[1] = statuses.get(0).path("created_at").toString();
+        result[2] = statuses.get(0).path("id").toString();
+        result[3] = statuses.get(1).path("metadata").toString();
+        result[4] = statuses.get(1).path("created_at").toString();
+        result[5] = statuses.get(1).path("id").toString();
+        return result;
+    }
+}
diff --git a/src/main/java/org/simdjson/BitIndexes.java b/src/main/java/org/simdjson/BitIndexes.java
index 55ba8ee..637eac9 100644
--- a/src/main/java/org/simdjson/BitIndexes.java
+++ b/src/main/java/org/simdjson/BitIndexes.java
@@ -1,5 +1,7 @@
 package org.simdjson;
 
+import java.util.Arrays;
+
 public class BitIndexes {
 
     private final int[] indexes;
diff --git a/src/main/java/org/simdjson/SimdJsonParser2.java b/src/main/java/org/simdjson/SimdJsonParser2.java
index 1ba292d..0d30539 100644
--- a/src/main/java/org/simdjson/SimdJsonParser2.java
+++ b/src/main/java/org/simdjson/SimdJsonParser2.java
@@ -29,21 +29,22 @@ static class JsonNode {
     private final String[] emptyResult;
     private JsonNode ptr;
     private byte[] buffer;
-    private final int targetParseNum;
+    private final int expectParseCols;
+    // every time json string is processed, currentVersion will be incremented by 1
     private long currentVersion = 0;
-    // pruning, when alreadyProcessedCols == NUM
-    private long alreadyProcessedCols = 0;
+    // pruning: number of requested paths resolved so far; parsing stops once it reaches expectParseCols
+    private long parseCols = 0;
 
     public SimdJsonParser2(String... args) {
         parser = new SimdJsonParser();
-        targetParseNum = args.length;
-        row = new JsonNode[targetParseNum];
-        result = new String[targetParseNum];
-        emptyResult = new String[targetParseNum];
+        expectParseCols = args.length;
+        row = new JsonNode[expectParseCols];
+        result = new String[expectParseCols];
+        emptyResult = new String[expectParseCols];
         for (int i = 0; i < args.length; i++) {
             emptyResult[i] = null;
         }
-        for (int i = 0; i < targetParseNum; i++) {
+        for (int i = 0; i < expectParseCols; i++) {
             JsonNode cur = root;
             String[] paths = args[i].split("\\.");
             for (int j = 0; j < paths.length; j++) {
@@ -65,7 +66,7 @@ public String[] parse(byte[] buffer, int len) {
         if (buffer == null || buffer.length == 0) {
             return emptyResult;
         }
-        this.alreadyProcessedCols = 0;
+        this.parseCols = 0;
         this.currentVersion++;
         this.ptr = root;
         this.buffer = buffer;
@@ -84,22 +85,34 @@ public String[] parse(byte[] buffer, int len) {
         return getResult();
     }
 
-    private void parseElement(String fieldName) {
-        if (fieldName == null) {
-            int start = bitIndexes.advance();
-            int realEnd = bitIndexes.advance();
-            while (realEnd > start) {
-                if (buffer[--realEnd] == '"') {
-                    break;
-                }
-            }
-            fieldName = new String(buffer, start + 1, realEnd - start - 1);
+    private String parseField() {
+        int start = bitIndexes.advance();
+        int next = bitIndexes.peek();
+        String field = new String(buffer, start, next - start).trim();
+        if ("null".equalsIgnoreCase(field)) {
+            return null;
+        }
+        // field type is string or type is decimal
+        if (field.startsWith("\"")) {
+            field = field.substring(1, field.length() - 1);
+        }
+        return field;
+    }
+
+    private void parseElement(String expectFieldName) {
+        if (parseCols >= expectParseCols) {
+            return;
+        }
+        // if expectFieldName is null, parent is map, else is list
+        if (expectFieldName == null) {
+            expectFieldName = parseField();
+            bitIndexes.advance(); // skip :
         }
-        if (!ptr.getChildren().containsKey(fieldName)) {
+        if (!ptr.getChildren().containsKey(expectFieldName)) {
             skip(false);
             return;
         }
-        ptr = ptr.getChildren().get(fieldName);
+        ptr = ptr.getChildren().get(expectFieldName);
         switch (buffer[bitIndexes.peek()]) {
             case '{' -> {
                 parseMap();
@@ -110,7 +123,7 @@ private void parseElement(String fieldName) {
             default -> {
                 ptr.setValue(skip(true));
                 ptr.setVersion(currentVersion);
-                ++alreadyProcessedCols;
+                ++parseCols;
             }
         }
         ptr = ptr.getParent();
@@ -120,12 +133,12 @@ private void parseMap() {
         if (ptr.getChildren() == null) {
             ptr.setValue(skip(true));
             ptr.setVersion(currentVersion);
-            ++alreadyProcessedCols;
+            ++parseCols;
             return;
         }
         ptr.setStart(bitIndexes.peek());
         bitIndexes.advance();
-        while (bitIndexes.hasNext() && buffer[bitIndexes.peek()] != '}' && alreadyProcessedCols < targetParseNum) {
+        while (bitIndexes.hasNext() && buffer[bitIndexes.peek()] != '}' && parseCols < expectParseCols) {
             parseElement(null);
             if (buffer[bitIndexes.peek()] == ',') {
                 bitIndexes.advance();
@@ -135,7 +148,7 @@ private void parseMap() {
         if (ptr.isLeaf()) {
             ptr.setValue(new String(buffer, ptr.getStart(), ptr.getEnd() - ptr.getStart() + 1));
             ptr.setVersion(currentVersion);
-            ++alreadyProcessedCols;
+            ++parseCols;
         }
         bitIndexes.advance();
     }
@@ -144,13 +157,13 @@ private void parseList() {
         if (ptr.getChildren() == null) {
             ptr.setValue(skip(true));
             ptr.setVersion(currentVersion);
-            ++alreadyProcessedCols;
+            ++parseCols;
             return;
         }
         ptr.setStart(bitIndexes.peek());
         bitIndexes.advance();
         int i = 0;
-        while (bitIndexes.hasNext() && buffer[bitIndexes.peek()] != ']' && alreadyProcessedCols < targetParseNum) {
+        while (bitIndexes.hasNext() && buffer[bitIndexes.peek()] != ']' && parseCols < expectParseCols) {
             parseElement("" + i);
             if (buffer[bitIndexes.peek()] == ',') {
                 bitIndexes.advance();
@@ -161,7 +174,7 @@ private void parseList() {
         if (ptr.isLeaf()) {
             ptr.setValue(new String(buffer, ptr.getStart(), ptr.getEnd() - ptr.getStart() + 1));
             ptr.setVersion(currentVersion);
-            ++alreadyProcessedCols;
+            ++parseCols;
         }
         bitIndexes.advance();
     }
@@ -198,32 +211,14 @@ private String skip(boolean retainValue) {
                 bitIndexes.advance();
                 return retainValue ? new String(buffer, start, end - start + 1) : null;
             }
-            case '"' -> {
-                bitIndexes.advance();
-                int realEnd = bitIndexes.peek();
-                while (realEnd > start) {
-                    if (buffer[--realEnd] == '"') {
-                        break;
-                    }
-                }
-                return retainValue ? new String(buffer, start + 1, realEnd - start - 1) : null;
-            }
             default -> {
-                bitIndexes.advance();
-                int realEnd = bitIndexes.peek();
-                while (realEnd >= start) {
-                    --realEnd;
-                    if (buffer[realEnd] >= '0' && buffer[realEnd] <= '9') {
-                        break;
-                    }
-                }
-                return retainValue ? new String(buffer, start, realEnd - start + 1) : null;
+                return parseField();
             }
         }
     }
 
     private String[] getResult() {
-        for (int i = 0; i < targetParseNum; i++) {
+        for (int i = 0; i < expectParseCols; i++) {
             if (row[i].getVersion() < currentVersion) {
                 result[i] = null;
                 continue;
diff --git a/src/test/java/org/simdjson/JsonMultiValueParsingTest.java b/src/test/java/org/simdjson/JsonMultiValueParsingTest.java
new file mode 100644
index 0000000..f0d8def
--- /dev/null
+++ b/src/test/java/org/simdjson/JsonMultiValueParsingTest.java
@@ -0,0 +1,42 @@
+package org.simdjson;
+
+import static org.simdjson.testutils.SimdJsonAssertions.assertThat;
+import static org.simdjson.testutils.TestUtils.toUtf8;
+
+import org.junit.jupiter.api.Test;
+
+public class JsonMultiValueParsingTest {
+    @Test
+    public void testParseMultiValue() {
+        byte[] json = toUtf8("{\"field1\":{\"field2\":\"value2\",\"field3\":3},\"field4\":[\"value4\",\"value5\"],\"field5\":null}");
+        SimdJsonParser2 parser = new SimdJsonParser2("field1.field2", "field1.field3", "field4", "field4.0", "field5");
+        String[] result = parser.parse(json, json.length);
+        assertThat(result[0]).isEqualTo("value2");
+        assertThat(result[1]).isEqualTo("3");
+        assertThat(result[2]).isEqualTo("[\"value4\",\"value5\"]");
+        assertThat(result[3]).isEqualTo("value4");
+        assertThat(result[4]).isEqualTo(null);
+    }
+
+    @Test
+    public void testNonAsciiCharacters() {
+        byte[] json = toUtf8("{\"ąćśńźż\": 1, \"\\u20A9\\u0E3F\": 2, \"αβγ\": 3, \"😀abc😀\": 4}");
+        SimdJsonParser2 parser = new SimdJsonParser2("ąćśńźż", "\\u20A9\\u0E3F", "αβγ", "😀abc😀");
+        // when
+        String[] result = parser.parse(json, json.length);
+        // then
+        assertThat(result[0]).isEqualTo("1");
+        assertThat(result[1]).isEqualTo("2");
+        assertThat(result[2]).isEqualTo("3");
+        assertThat(result[3]).isEqualTo("4");
+    }
+
+    @Test
+    public void testStopsOnceAllPathsAreResolved() {
+        // "b" follows the only requested field; the parser must terminate instead of spinning on it
+        byte[] json = toUtf8("{\"a\":1,\"b\":2}");
+        SimdJsonParser2 parser = new SimdJsonParser2("a");
+        String[] result = parser.parse(json, json.length);
+        assertThat(result[0]).isEqualTo("1");
+    }
+}