Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add schema-based parsing #43

Merged
merged 5 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
build
profilers
testdata
hotspot_*.log
36 changes: 24 additions & 12 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import me.champeau.jmh.JmhBytecodeGeneratorTask
import org.gradle.internal.os.OperatingSystem
import org.ajoberstar.grgit.Grgit
import org.gradle.internal.os.OperatingSystem

import java.time.Duration

plugins {
Expand Down Expand Up @@ -42,20 +43,20 @@ java {
}

ext {
junitVersion = '5.9.1'
jsoniterScalaVersion = '2.24.4'
junitVersion = '5.10.2'
jsoniterScalaVersion = '2.28.4'
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.16.0'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.42'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.17.0'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.49'
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
testImplementation group: 'org.junit-pioneer', name: 'junit-pioneer', version: '2.2.0'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
Expand Down Expand Up @@ -136,15 +137,21 @@ jmh {
'--add-modules=jdk.incubator.vector'
]
if (getBooleanProperty('jmh.profilersEnabled', false)) {
createDirIfDoesNotExist('./profilers')
if (OperatingSystem.current().isLinux()) {
profilers = [
'perf',
'perfasm:intelSyntax=true',
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('LD_LIBRARY_PATH')
def profilerList = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('LD_LIBRARY_PATH')
]
if (getBooleanProperty('jmh.jitLogEnabled', false)) {
createDirIfDoesNotExist('./profilers/perfasm')
profilerList += [
'perfasm:intelSyntax=true;saveLog=true;saveLogTo=./profilers/perfasm'
]
}
profilers = profilerList
} else if (OperatingSystem.current().isMacOsX()) {
profilers = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('DYLD_LIBRARY_PATH')
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('DYLD_LIBRARY_PATH')
]
}
}
Expand Down Expand Up @@ -218,6 +225,11 @@ def getBooleanProperty(String name, boolean defaultValue) {
Boolean.valueOf((project.findProperty(name) ?: defaultValue) as String)
}

static def getAsyncProfilerLibPath(String envVarName) {
/**
 * Resolves the library search path handed to the async profiler.
 *
 * Reads the environment variable named by {@code envVarName}; when it is unset or empty
 * (Groovy truth), falls back to the JVM's {@code java.library.path} system property.
 */
static def getLibPath(String envVarName) {
    def fromEnvironment = System.getenv(envVarName)
    return fromEnvironment ? fromEnvironment : System.getProperty('java.library.path')
}

/**
 * Creates the directory at {@code dir}, including any missing parent directories.
 *
 * Returns the result of {@code File.mkdirs()}, which is {@code false} when the directory
 * already exists, so callers should not treat {@code false} as a failure.
 */
static createDirIfDoesNotExist(String dir) {
    new File(dir).mkdirs()
}
4 changes: 2 additions & 2 deletions src/jmh/java/org/simdjson/NumberParserBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
public class NumberParserBenchmark {

private final Tape tape = new Tape(100);
private final NumberParser numberParser = new NumberParser(tape);
private final NumberParser numberParser = new NumberParser();

@Param({
"2.2250738585072013e-308", // fast path
Expand All @@ -43,7 +43,7 @@ public double baseline() {
@Benchmark
public double simdjson() {
tape.reset();
numberParser.parseNumber(numberUtf8Bytes, 0);
numberParser.parseNumber(numberUtf8Bytes, 0, tape);
return tape.getDouble(0);
}
}
31 changes: 1 addition & 30 deletions src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
import com.alibaba.fastjson2.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
import com.jsoniter.JsonIterator;
import com.jsoniter.any.Any;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
Expand Down Expand Up @@ -43,19 +39,7 @@ public void setup() throws IOException {
buffer = is.readAllBytes();
bufferPadded = padded(buffer);
}
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_jsoniter_scala() throws IOException {
Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
Set<String> defaultUsers = new HashSet<>();
for (Status tweet: twitter.statuses()) {
User user = tweet.user();
if (user.default_profile()) {
defaultUsers.add(user.screen_name());
}
}
return defaultUsers.size();
System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
}

@Benchmark
Expand Down Expand Up @@ -88,19 +72,6 @@ public int countUniqueUsersWithDefaultProfile_fastjson() {
return defaultUsers.size();
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_jsoniter() {
Any json = JsonIterator.deserialize(buffer);
Set<String> defaultUsers = new HashSet<>();
for (Any tweet : json.get("statuses")) {
Any user = tweet.get("user");
if (user.get("default_profile").toBoolean()) {
defaultUsers.add(user.get("screen_name").toString());
}
}
return defaultUsers.size();
}

@Benchmark
public int countUniqueUsersWithDefaultProfile_simdjson() {
JsonValue simdJsonValue = simdJsonParser.parse(buffer, buffer.length);
Expand Down
123 changes: 123 additions & 0 deletions src/jmh/java/org/simdjson/SchemaBasedParseAndSelectBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package org.simdjson;

import com.alibaba.fastjson2.JSON;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static org.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark that parses {@code /twitter.json} directly into typed objects
 * (schema-based parsing) with several JSON libraries and then performs the same selection on
 * the result: counting the distinct {@code screen_name}s of users whose
 * {@code default_profile} flag is set.
 *
 * <p>Every benchmark method returns the size of the resulting set so that the work cannot be
 * eliminated as dead code.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class SchemaBasedParseAndSelectBenchmark {

    /** Classpath location of the benchmark input document. */
    private static final String TWITTER_RESOURCE = "/twitter.json";

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();
    // The target records declare only a subset of the document's fields, so unknown properties
    // must be ignored rather than rejected.
    private final ObjectMapper objectMapper = new ObjectMapper()
            .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    /** Raw bytes of the input document. */
    private byte[] buffer;
    /** Copy of {@link #buffer} produced by {@code SimdJsonPaddingUtil.padded}. */
    private byte[] bufferPadded;

    /**
     * Loads the input document once per trial and prints the vector species in use.
     *
     * @throws IOException if the resource is missing from the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        try (InputStream is = SchemaBasedParseAndSelectBenchmark.class.getResourceAsStream(TWITTER_RESOURCE)) {
            // getResourceAsStream returns null for a missing resource; fail with a descriptive
            // error instead of an anonymous NullPointerException.
            if (is == null) {
                throw new IOException("Benchmark resource not found on classpath: " + TWITTER_RESOURCE);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
        System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
    }

    /** simdjson schema-based parsing of the unpadded buffer. */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjson() {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = simdJsonParser.parse(buffer, buffer.length, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * simdjson schema-based parsing of the padded buffer. The length passed is that of the
     * original, unpadded document.
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjsonPadded() {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = simdJsonParser.parse(bufferPadded, buffer.length, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Jackson data-binding into the same record types.
     *
     * @throws IOException if Jackson fails to read or bind the document
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jackson() throws IOException {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = objectMapper.readValue(buffer, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /**
     * jsoniter-scala parsing via the codec exposed by {@code Twitter$}; uses the separate
     * {@code Twitter}/{@code Status}/{@code User} types declared elsewhere in the project.
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jsoniter_scala() {
        Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
        Set<String> defaultUsers = new HashSet<>();
        for (Status tweet : twitter.statuses()) {
            User user = tweet.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    /** fastjson2 binding into the same record types. */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_fastjson() {
        Set<String> defaultUsers = new HashSet<>();
        SimdJsonTwitter twitter = JSON.parseObject(buffer, SimdJsonTwitter.class);
        for (SimdJsonStatus status : twitter.statuses()) {
            SimdJsonUser user = status.user();
            if (user.default_profile()) {
                defaultUsers.add(user.screen_name());
            }
        }
        return defaultUsers.size();
    }

    // NOTE(review): component names presumably mirror the JSON keys of the document (hence the
    // snake_case); confirm before renaming.

    /** The two user fields read by the benchmarks. */
    record SimdJsonUser(boolean default_profile, String screen_name) {

    }

    /** A single status (tweet); only its user is mapped. */
    record SimdJsonStatus(SimdJsonUser user) {

    }

    /** Root of the document; only the list of statuses is mapped. */
    record SimdJsonTwitter(List<SimdJsonStatus> statuses) {

    }
}
37 changes: 36 additions & 1 deletion src/main/java/org/simdjson/BitIndexes.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,26 @@ private long clearLowestBit(long bits) {
return bits & (bits - 1);
}

int advance() {
/** Moves the read cursor forward by one position without returning a value. */
void advance() {
readIdx++;
}

/**
 * Returns the index stored at the current read position and then moves the read cursor
 * forward by one.
 */
int getAndAdvance() {
    assert readIdx <= writeIdx;
    // Bump the cursor before touching the array, exactly as a post-increment subscript does.
    int position = readIdx++;
    return indexes[position];
}

/** Returns the most recently written index, i.e. the entry just before the write cursor. */
int getLast() {
    int lastPosition = writeIdx - 1;
    return indexes[lastPosition];
}

/** Moves the read cursor forward by one and returns the index stored at the new position. */
int advanceAndGet() {
    assert readIdx + 1 <= writeIdx;
    readIdx += 1;
    return indexes[readIdx];
}

/** Returns the index stored at the current read position without moving the read cursor. */
int peek() {
    assert readIdx <= writeIdx;
    int current = indexes[readIdx];
    return current;
}

Expand All @@ -60,6 +75,26 @@ boolean isEnd() {
return writeIdx == readIdx;
}

/** Reports whether the read cursor has moved beyond the write cursor. */
boolean isPastEnd() {
    return writeIdx < readIdx;
}

/**
 * Writes a sentinel value of 0 into the slot at the write cursor, i.e. directly after the last
 * recorded structural index. The write cursor itself is not moved.
 */
void finish() {
// If we go past the end of the detected structural indexes, we are dealing with invalid JSON and
// must stop processing immediately and throw an exception. To avoid checking after every
// increment of readIdx whether this has happened, we make the slot past the end jump back to the
// first structural element. This should produce the desired outcome, i.e., an iterator should
// detect the invalid JSON.
//
// To understand why this works, first exclude primitive values (numbers, strings, booleans,
// nulls) from the set of possible JSON documents. We can do this because, when these values are
// parsed, the length of the input buffer is verified, ensuring we never go past its end.
// Therefore, we can focus solely on objects and arrays. Since we always check that if the first
// character is '{', the last one must be '}', and if the first character is '[', the last one
// must be ']', we know that if we have reached beyond the buffer without crashing, the input is
// either '{...}' or '[...]'. Thus, if we jump to the first structural element, we effectively
// see either '{...}{' or '[...]['. Both are invalid sequences and will be detected by the
// iterator, which will then stop processing and throw an exception reporting the invalid JSON.
indexes[writeIdx] = 0;
}

void reset() {
writeIdx = 0;
readIdx = 0;
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/simdjson/ClassResolver.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.simdjson;

import java.lang.reflect.Type;
import java.util.HashMap;
import java.util.Map;

/**
 * Memoizes {@link ResolvedClass} instances per {@link Type}, so that each type is resolved at
 * most once between calls to {@link #reset()}.
 */
class ClassResolver {

    private final Map<Type, ResolvedClass> classCache = new HashMap<>();

    /**
     * Returns the cached {@link ResolvedClass} for {@code type}, creating and caching it on the
     * first request.
     *
     * <p>The new {@link ResolvedClass} is constructed with a reference to this resolver and is
     * stored in the cache only after its constructor returns.
     */
    ResolvedClass resolveClass(Type type) {
        ResolvedClass cached = classCache.get(type);
        if (cached == null) {
            // NOTE(review): kept as an explicit get/put. The constructor receives this resolver
            // and presumably calls back into resolveClass for nested types, which
            // HashMap.computeIfAbsent would not tolerate. Confirm before "simplifying".
            cached = new ResolvedClass(type, this);
            classCache.put(type, cached);
        }
        return cached;
    }

    /** Drops every cached entry. */
    void reset() {
        classCache.clear();
    }
}
4 changes: 4 additions & 0 deletions src/main/java/org/simdjson/ConstructorArgument.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package org.simdjson;

/**
 * Pairs an integer index with the {@link ResolvedClass} describing a type.
 *
 * <p>NOTE(review): judging by the name, {@code idx} is presumably the position of the argument
 * in a constructor's parameter list and {@code resolvedClass} its resolved type. Confirm
 * against the call sites.
 */
record ConstructorArgument(int idx, ResolvedClass resolvedClass) {
}
Loading
Loading