From ebe3d0098b4618a8ff174f03812be6ad7615fcae Mon Sep 17 00:00:00 2001
From: jimeng
Date: Fri, 4 Oct 2024 22:56:59 +0800
Subject: [PATCH] add SimdJsonParser2 base on bitindex

---
 build.gradle                                  |  10 +-
 src/main/java/org/simdjson/BitIndexes.java    |   6 +-
 .../java/org/simdjson/SimdJsonParser.java     |   9 +-
 .../java/org/simdjson/SimdJsonParser2.java    | 298 ++++++++++++++++++
 4 files changed, 318 insertions(+), 5 deletions(-)
 create mode 100644 src/main/java/org/simdjson/SimdJsonParser2.java

diff --git a/build.gradle b/build.gradle
index 60e5f4b..66f9d78 100644
--- a/build.gradle
+++ b/build.gradle
@@ -26,6 +26,7 @@ group = 'org.simdjson'
 version = scmVersion.version
 
 repositories {
+    mavenLocal()
     mavenCentral()
 }
 
@@ -45,6 +46,7 @@ java {
 ext {
     junitVersion = '5.10.2'
     jsoniterScalaVersion = '2.28.4'
+    lombokVersion = '1.18.34'
 }
 
 dependencies {
@@ -53,6 +55,10 @@ dependencies {
     jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
     jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
     compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
+    compileOnly group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    annotationProcessor group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    testCompileOnly group: 'org.projectlombok', name: 'lombok', version: lombokVersion
+    testAnnotationProcessor group: 'org.projectlombok', name: 'lombok', version: lombokVersion
 
     testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
     testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
@@ -160,7 +166,9 @@ publishing {
     publications {
         mavenJava(MavenPublication) {
             from(components.java)
-
+            groupId = 'org.simdjson'
+            artifactId = 'simdjson-java'
+            version = scmVersion.version
             pom {
                 name = project.name
                 description = 'A Java version of simdjson, a high-performance JSON parser utilizing SIMD instructions.'
diff --git a/src/main/java/org/simdjson/BitIndexes.java b/src/main/java/org/simdjson/BitIndexes.java
index 59c0dc3..55ba8ee 100644
--- a/src/main/java/org/simdjson/BitIndexes.java
+++ b/src/main/java/org/simdjson/BitIndexes.java
@@ -1,6 +1,6 @@
 package org.simdjson;
 
-class BitIndexes {
+public class BitIndexes {
 
     private final int[] indexes;
 
@@ -44,8 +44,8 @@ private long clearLowestBit(long bits) {
         return bits & (bits - 1);
     }
 
-    void advance() {
-        readIdx++;
+    int advance() {
+        return indexes[readIdx++];
     }
 
     int getAndAdvance() {
diff --git a/src/main/java/org/simdjson/SimdJsonParser.java b/src/main/java/org/simdjson/SimdJsonParser.java
index 707124c..0208d38 100644
--- a/src/main/java/org/simdjson/SimdJsonParser.java
+++ b/src/main/java/org/simdjson/SimdJsonParser.java
@@ -1,5 +1,7 @@
 package org.simdjson;
 
+import lombok.Getter;
+
 public class SimdJsonParser {
 
     private static final int PADDING = 64;
@@ -24,7 +26,12 @@ public SimdJsonParser(int capacity, int maxDepth) {
         paddedBuffer = new byte[capacity];
         indexer = new StructuralIndexer(bitIndexes);
     }
-
+    public BitIndexes buildBitIndex (byte[] buffer, int len) {
+        byte[] padded = padIfNeeded(buffer, len);
+        reset();
+        stage1(padded, len);
+        return bitIndexes;
+    }
     public <T> T parse(byte[] buffer, int len, Class<T> expectedType) {
         byte[] padded = padIfNeeded(buffer, len);
         reset();
diff --git a/src/main/java/org/simdjson/SimdJsonParser2.java b/src/main/java/org/simdjson/SimdJsonParser2.java
new file mode 100644
index 0000000..1ba292d
--- /dev/null
+++ b/src/main/java/org/simdjson/SimdJsonParser2.java
@@ -0,0 +1,298 @@
+package org.simdjson;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Extracts the values at a fixed set of dot-separated paths from JSON documents by walking the
+ * structural index produced by {@link SimdJsonParser#buildBitIndex(byte[], int)}.
+ *
+ * <p>Array elements are addressed by their decimal position, e.g. {@code a.0.b}. String values
+ * are returned without the surrounding quotes and without unescaping; every other value is
+ * returned as its raw JSON text. When a requested key occurs more than once, the first value
+ * found is kept.
+ *
+ * <p>Instances keep mutable parse state and reuse the returned array, so they must not be shared
+ * between threads.
+ */
+public class SimdJsonParser2 {
+
+    /**
+     * Node of the trie built from the requested paths.
+     *
+     * <p>Only accessors are generated: field-based equals/hashCode/toString would recurse
+     * endlessly through the parent/children links.
+     */
+    @Getter
+    @Setter
+    @RequiredArgsConstructor
+    static class JsonNode {
+        // Parse round in which value was written; values from earlier rounds are stale.
+        private long version = 0;
+        // True when a requested path ends at this node.
+        private boolean isLeaf = false;
+        private final String name;
+        private String value = null;
+        private JsonNode parent = null;
+        private Map<String, JsonNode> children = new HashMap<>();
+        // Buffer offsets of the brackets of the container most recently parsed into this node.
+        private int start = -1;
+        private int end = -1;
+    }
+
+    private final SimdJsonParser parser;
+    private BitIndexes bitIndexes;
+    private final JsonNode root = new JsonNode(null);
+    private final JsonNode[] row;
+    private final String[] result;
+    private final String[] emptyResult;
+    private JsonNode ptr;
+    private byte[] buffer;
+    private final int targetParseNum;
+    private long currentVersion = 0;
+    // Paths resolved in the current document; parsing stops early at targetParseNum.
+    private long alreadyProcessedCols = 0;
+
+    /**
+     * Creates an extractor for the given paths.
+     *
+     * @param args dot-separated paths; entry i of the array returned by
+     *             {@link #parse(byte[], int)} holds the value found at path i
+     */
+    public SimdJsonParser2(String... args) {
+        parser = new SimdJsonParser();
+        targetParseNum = args.length;
+        row = new JsonNode[targetParseNum];
+        result = new String[targetParseNum];
+        // A new array is already filled with nulls.
+        emptyResult = new String[targetParseNum];
+        for (int i = 0; i < targetParseNum; i++) {
+            JsonNode cur = root;
+            for (String segment : args[i].split("\\.")) {
+                JsonNode child = cur.getChildren().get(segment);
+                if (child == null) {
+                    child = new JsonNode(segment);
+                    child.setParent(cur);
+                    cur.getChildren().put(segment, child);
+                }
+                cur = child;
+            }
+            cur.setLeaf(true);
+            row[i] = cur;
+        }
+    }
+
+    /**
+     * Extracts the requested paths from one JSON document.
+     *
+     * @param buffer bytes of the document, which must be a JSON object or array
+     * @param len    number of bytes of {@code buffer} that belong to the document
+     * @return one entry per requested path, {@code null} where nothing was found; the array is
+     *         reused by later calls
+     * @throws RuntimeException if the document does not start with an object or array, or the
+     *                          structural index ends before the document is complete
+     */
+    public String[] parse(byte[] buffer, int len) {
+        // Validate first: the indexer must not be handed a null or empty buffer.
+        if (buffer == null || buffer.length == 0 || len <= 0) {
+            return emptyResult;
+        }
+        this.bitIndexes = parser.buildBitIndex(buffer, len);
+        this.alreadyProcessedCols = 0;
+        this.currentVersion++;
+        this.ptr = root;
+        this.buffer = buffer;
+
+        switch (current()) {
+            case '{' -> parseMap();
+            case '[' -> parseList();
+            default -> throw invalidJson();
+        }
+        return getResult();
+    }
+
+    /**
+     * Handles one member of the container {@code ptr} points at and leaves the cursor just after
+     * the member's value.
+     *
+     * @param fieldName array position as text, or {@code null} to read an object key at the cursor
+     */
+    private void parseElement(String fieldName) {
+        if (fieldName == null) {
+            fieldName = readFieldName();
+        }
+        JsonNode child = ptr.getChildren().get(fieldName);
+        // Skip members nobody asked for, and repeats of a key that was already recorded.
+        if (child == null || (child.isLeaf() && child.getVersion() == currentVersion)) {
+            skip(false);
+            return;
+        }
+        ptr = child;
+        switch (current()) {
+            case '{' -> parseMap();
+            case '[' -> parseList();
+            default -> captureValue();
+        }
+        ptr = ptr.getParent();
+    }
+
+    /** Reads the object key at the cursor and moves the cursor past the colon that follows. */
+    private String readFieldName() {
+        if (current() != '"') {
+            throw invalidJson();
+        }
+        int open = bitIndexes.advance();
+        if (current() != ':') {
+            throw invalidJson();
+        }
+        int close = closingQuoteBefore(bitIndexes.advance(), open);
+        return new String(buffer, open + 1, close - open - 1, StandardCharsets.UTF_8);
+    }
+
+    /** Parses the object at the cursor into the node {@code ptr} points at. */
+    private void parseMap() {
+        parseContainer((byte) '}', true);
+    }
+
+    /** Parses the array at the cursor into the node {@code ptr} points at. */
+    private void parseList() {
+        parseContainer((byte) ']', false);
+    }
+
+    /**
+     * Walks the members of the object or array at the cursor.
+     *
+     * @param close closing bracket of the container
+     * @param keyed {@code true} for objects, whose members carry their own key
+     */
+    private void parseContainer(byte close, boolean keyed) {
+        // No requested path goes deeper, so consume the whole container in one step.
+        if (ptr.getChildren().isEmpty()) {
+            captureValue();
+            return;
+        }
+        ptr.setStart(bitIndexes.peek());
+        bitIndexes.advance();
+        int position = 0;
+        while (current() != close && alreadyProcessedCols < targetParseNum) {
+            parseElement(keyed ? null : Integer.toString(position++));
+            if (current() == ',') {
+                bitIndexes.advance();
+            }
+        }
+        ptr.setEnd(bitIndexes.peek());
+        if (ptr.isLeaf()) {
+            int length = ptr.getEnd() - ptr.getStart() + 1;
+            store(new String(buffer, ptr.getStart(), length, StandardCharsets.UTF_8));
+        }
+        bitIndexes.advance();
+    }
+
+    /** Consumes the value at the cursor, keeping its text only if a requested path ends here. */
+    private void captureValue() {
+        if (ptr.isLeaf()) {
+            store(skip(true));
+        } else {
+            skip(false);
+        }
+    }
+
+    /** Records {@code value} as this round's result for the node {@code ptr} points at. */
+    private void store(String value) {
+        ptr.setValue(value);
+        ptr.setVersion(currentVersion);
+        ++alreadyProcessedCols;
+    }
+
+    /**
+     * Moves the cursor past the value it points at.
+     *
+     * @param retainValue whether the text of the value is needed
+     * @return the text of the value if {@code retainValue} is set, otherwise {@code null}
+     */
+    private String skip(boolean retainValue) {
+        int start = nextStructural();
+        byte first = buffer[start];
+        int from = first == '"' ? start + 1 : start;
+        int to = switch (first) {
+            case '{' -> skipContainer((byte) '{', (byte) '}') + 1;
+            case '[' -> skipContainer((byte) '[', (byte) ']') + 1;
+            case '"' -> {
+                bitIndexes.advance();
+                yield closingQuoteBefore(nextStructural(), start);
+            }
+            default -> {
+                bitIndexes.advance();
+                int end = nextStructural();
+                // Trim the whitespace between the scalar and the next structural character.
+                while (end > start && isWhitespace(buffer[end - 1])) {
+                    end--;
+                }
+                yield end;
+            }
+        };
+        return retainValue ? new String(buffer, from, to - from, StandardCharsets.UTF_8) : null;
+    }
+
+    /** Skips the container at the cursor and returns the offset of its closing bracket. */
+    private int skipContainer(byte open, byte close) {
+        int depth = 1;
+        while (depth > 0) {
+            bitIndexes.advance();
+            byte b = current();
+            if (b == open) {
+                depth++;
+            } else if (b == close) {
+                depth--;
+            }
+        }
+        // advance() returns the offset it steps over, i.e. the closing bracket.
+        return bitIndexes.advance();
+    }
+
+    /** Returns the buffer offset of the structural character at the cursor. */
+    private int nextStructural() {
+        if (!bitIndexes.hasNext()) {
+            throw invalidJson();
+        }
+        return bitIndexes.peek();
+    }
+
+    /** Returns the structural character at the cursor. */
+    private byte current() {
+        return buffer[nextStructural()];
+    }
+
+    /** Returns the offset of the last quote strictly between {@code open} and {@code limit}. */
+    private int closingQuoteBefore(int limit, int open) {
+        int i = limit - 1;
+        while (i > open && buffer[i] != '"') {
+            i--;
+        }
+        if (i <= open) {
+            throw invalidJson();
+        }
+        return i;
+    }
+
+    private static boolean isWhitespace(byte b) {
+        return b == ' ' || b == '\t' || b == '\n' || b == '\r';
+    }
+
+    private static RuntimeException invalidJson() {
+        return new RuntimeException("invalid json format");
+    }
+
+    /** Fills the shared result array; paths not seen in this round become {@code null}. */
+    private String[] getResult() {
+        for (int i = 0; i < targetParseNum; i++) {
+            JsonNode node = row[i];
+            result[i] = node.getVersion() < currentVersion ? null : node.getValue();
+        }
+        return result;
+    }
+}