Skip to content

Commit

Permalink
feat(fuzzer): Add input generator for json_parse in expression fuzzer (
Browse files Browse the repository at this point in the history
…#11466)

Summary:


Make the expression fuzzer generate input vectors of valid JSON strings for the 
json_parse function. To test corner cases, the JSON strings may be randomly 
truncated or have a space character inserted.

Differential Revision: D65576377
  • Loading branch information
kagamiori authored and facebook-github-bot committed Dec 4, 2024
1 parent f9c213f commit f51f6c0
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 38 deletions.
3 changes: 1 addition & 2 deletions velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,7 @@ bool PrestoQueryRunner::isSupported(const exec::FunctionSignature& signature) {
return !(
usesTypeName(signature, "interval year to month") ||
usesTypeName(signature, "hugeint") ||
usesTypeName(signature, "hyperloglog") ||
usesTypeName(signature, "json"));
usesTypeName(signature, "hyperloglog"));
}

std::optional<std::string> PrestoQueryRunner::toSql(
Expand Down
28 changes: 28 additions & 0 deletions velox/expression/fuzzer/ExpressionFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ ExpressionFuzzer::ExpressionFuzzer(
// Register function override (for cases where we want to restrict the types
// or parameters we pass to functions).
registerFuncOverride(&ExpressionFuzzer::generateSwitchArgs, "switch");
registerFuncOverride(&ExpressionFuzzer::generateJsonParseArg, "json_parse");
}

bool ExpressionFuzzer::isSupportedSignature(
Expand Down Expand Up @@ -771,6 +772,7 @@ core::TypedExprPtr ExpressionFuzzer::generateArgColumn(const TypePtr& arg) {
state.inputRowTypes_.emplace_back(arg);
state.inputRowNames_.emplace_back(
fmt::format("c{}", state.inputRowTypes_.size() - 1));
state.customInputGenerators_.emplace_back(nullptr);
listOfCandidateCols.push_back(state.inputRowNames_.back());
return std::make_shared<core::FieldAccessTypedExpr>(
arg, state.inputRowNames_.back());
Expand Down Expand Up @@ -940,6 +942,31 @@ std::vector<core::TypedExprPtr> ExpressionFuzzer::generateSwitchArgs(
return inputExpressions;
}

std::vector<core::TypedExprPtr> ExpressionFuzzer::generateJsonParseArg(
const CallableSignature& input) {
VELOX_CHECK_EQ(input.args.size(), 1);
std::vector<core::TypedExprPtr> inputExpressions;

state.inputRowTypes_.emplace_back(input.args[0]);
state.inputRowNames_.emplace_back(
fmt::format("c{}", state.inputRowTypes_.size() - 1));

const auto representedType = vectorFuzzer_->randType(3);
const auto seed = rand<uint32_t>(rng_);
const auto nullRatio = vectorFuzzer_->getOptions().nullRatio;
state.customInputGenerators_.emplace_back(
std::make_shared<fuzzer::JsonInputGenerator>(
seed,
input.args[0],
nullRatio,
fuzzer::getRandomInputGenerator(seed, representedType, nullRatio),
true));

inputExpressions.push_back(std::make_shared<core::FieldAccessTypedExpr>(
input.args[0], state.inputRowNames_.back()));
return inputExpressions;
}

ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions(
const RowTypePtr& outType) {
state.reset();
Expand All @@ -953,6 +980,7 @@ ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions(
return {
std::move(expressions),
ROW(std::move(state.inputRowNames_), std::move(state.inputRowTypes_)),
std::move(state.customInputGenerators_),
std::move(state.expressionStats_)};
}

Expand Down
13 changes: 13 additions & 0 deletions velox/expression/fuzzer/ExpressionFuzzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ class ExpressionFuzzer {
// The input vector type that is expected by the generated expressions.
RowTypePtr inputType;

// Custom input generators for input vectors. The generator at index i
// corresponds to the i-th field in inputType. If customInputGenerators[i]
// doesn't exist or is nullptr, then no custom input generator is used for
// the i-th field.
std::vector<std::shared_ptr<AbstractInputGenerator>> customInputGenerators;

// Count how many times each expression has been selected in expressions.
std::unordered_map<std::string, size_t> selectionStats;
};
Expand Down Expand Up @@ -270,6 +276,11 @@ class ExpressionFuzzer {
std::vector<core::TypedExprPtr> generateSwitchArgs(
const CallableSignature& input);

/// Specialization for the "json_parse" function. It makes json_parse take a
/// column argument that contains valid JSON strings with random variations.
std::vector<core::TypedExprPtr> generateJsonParseArg(
const CallableSignature& input);

core::TypedExprPtr getCallExprFromCallable(
const CallableSignature& callable,
const TypePtr& type);
Expand Down Expand Up @@ -417,6 +428,8 @@ class ExpressionFuzzer {
/// expressions consume.
std::vector<TypePtr> inputRowTypes_;
std::vector<std::string> inputRowNames_;
/// Contains the custom input generators for the input vectors.
std::vector<std::shared_ptr<AbstractInputGenerator>> customInputGenerators_;

// Count how many times each function has been selected.
std::unordered_map<std::string, size_t> expressionStats_;
Expand Down
11 changes: 7 additions & 4 deletions velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,10 @@ void ExpressionFuzzerVerifier::retryWithTry(

RowVectorPtr ExpressionFuzzerVerifier::fuzzInputWithRowNumber(
VectorFuzzer& fuzzer,
const RowTypePtr& type) {
auto rowVector = fuzzer.fuzzInputRow(type);
const RowTypePtr& type,
const std::vector<std::shared_ptr<AbstractInputGenerator>>&
inputGenerators) {
auto rowVector = fuzzer.fuzzInputRow(type, inputGenerators);
auto names = type->names();
names.push_back("row_number");

Expand Down Expand Up @@ -362,7 +364,7 @@ void ExpressionFuzzerVerifier::go() {
// re-use columns and share sub-expressions if the appropriate flag is set.
int numExpressionTrees = boost::random::uniform_int_distribution<int>(
1, options_.maxExpressionTreesPerStep)(rng_);
auto [expressions, inputType, selectionStats] =
auto [expressions, inputType, inputGenerators, selectionStats] =
expressionFuzzer_.fuzzExpressions(numExpressionTrees);
// Project a row number column in the output to enable epsilon-comparison
// for floating-point columns and make investigation of failures easier.
Expand All @@ -375,7 +377,8 @@ void ExpressionFuzzerVerifier::go() {

std::vector<core::TypedExprPtr> plans = std::move(expressions);

auto rowVector = fuzzInputWithRowNumber(*vectorFuzzer_, inputType);
auto rowVector =
fuzzInputWithRowNumber(*vectorFuzzer_, inputType, inputGenerators);

InputRowMetadata inputRowMetadata =
generateInputRowMetadata(rowVector, *vectorFuzzer_);
Expand Down
4 changes: 3 additions & 1 deletion velox/expression/fuzzer/ExpressionFuzzerVerifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,9 @@ class ExpressionFuzzerVerifier {
// Fuzzes the input vector of type with an additional row number column.
RowVectorPtr fuzzInputWithRowNumber(
VectorFuzzer& fuzzer,
const RowTypePtr& type);
const RowTypePtr& type,
const std::vector<std::shared_ptr<AbstractInputGenerator>>&
inputGenerators);

const Options options_;

Expand Down
8 changes: 7 additions & 1 deletion velox/vector/fuzzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ target_link_libraries(
add_library(velox_vector_fuzzer GeneratorSpec.cpp VectorFuzzer.cpp)

target_link_libraries(
velox_vector_fuzzer velox_type velox_vector velox_vector_fuzzer_util)
velox_vector_fuzzer
velox_type
velox_vector
velox_vector_fuzzer_util
velox_presto_types
velox_fuzzer_constrained_input_generators)

if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
target_compile_options(velox_vector_fuzzer
PRIVATE -Wno-deprecated-declarations)
Expand Down
98 changes: 76 additions & 22 deletions velox/vector/fuzzer/VectorFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,21 @@
#include <locale>

#include "velox/common/base/Exceptions.h"
#include "velox/functions/prestosql/types/JsonType.h"
#include "velox/type/Timestamp.h"
#include "velox/vector/BaseVector.h"
#include "velox/vector/FlatVector.h"
#include "velox/vector/NullsBuilder.h"
#include "velox/vector/VectorTypeUtils.h"
#include "velox/vector/fuzzer/ConstrainedVectorGenerator.h"
#include "velox/vector/fuzzer/Utils.h"

namespace facebook::velox {

namespace {

using fuzzer::AbstractInputGenerator;

// Helper structure for making temporary changes to Options. This object saves
// the current state of the Options object and restores it when it is destructed.
// For instance, if you would like to temporarily disable nulls for a particular
Expand Down Expand Up @@ -116,7 +120,13 @@ VectorPtr fuzzConstantPrimitiveImpl(
const TypePtr& type,
vector_size_t size,
FuzzerGenerator& rng,
const VectorFuzzer::Options& opts) {
const VectorFuzzer::Options& opts,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
if (customGenerator) {
return fuzzer::ConstrainedVectorGenerator::generateConstant(
customGenerator, size, pool);
}

using TCpp = typename TypeTraits<kind>::NativeType;
if constexpr (std::is_same_v<TCpp, StringView>) {
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> converter;
Expand Down Expand Up @@ -225,21 +235,31 @@ bool hasNestedDictionaryLayers(const VectorPtr& baseVector) {

} // namespace

VectorPtr VectorFuzzer::fuzzNotNull(const TypePtr& type) {
return fuzzNotNull(type, opts_.vectorSize);
VectorPtr VectorFuzzer::fuzzNotNull(
const TypePtr& type,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
return fuzzNotNull(type, opts_.vectorSize, customGenerator);
}

VectorPtr VectorFuzzer::fuzzNotNull(const TypePtr& type, vector_size_t size) {
VectorPtr VectorFuzzer::fuzzNotNull(
const TypePtr& type,
vector_size_t size,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
ScopedOptions restorer(this);
opts_.nullRatio = 0;
return fuzz(type, size);
return fuzz(type, size, customGenerator);
}

VectorPtr VectorFuzzer::fuzz(const TypePtr& type) {
return fuzz(type, opts_.vectorSize);
VectorPtr VectorFuzzer::fuzz(
const TypePtr& type,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
return fuzz(type, opts_.vectorSize, customGenerator);
}

VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) {
VectorPtr VectorFuzzer::fuzz(
const TypePtr& type,
vector_size_t size,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
VectorPtr vector;
vector_size_t vectorSize = size;

Expand All @@ -252,13 +272,32 @@ VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) {

// 20% chance of adding a constant vector.
if (coinToss(0.2)) {
vector = fuzzConstant(type, vectorSize);
} else if (type->isPrimitiveType()) {
vector = fuzzFlatPrimitive(type, vectorSize);
vector = fuzzConstant(type, vectorSize, customGenerator);
} else if (type->isOpaque()) {
vector = fuzzFlatOpaque(type, vectorSize);
} else {
vector = fuzzComplex(type, vectorSize);
if (customGenerator) {
vector = fuzzer::ConstrainedVectorGenerator::generateFlat(
customGenerator, vectorSize, pool_);
} else {
if (isJsonType(type)) {
const auto representedType = randType(3);
const auto seed = rand<uint32_t>(rng_);
std::shared_ptr<AbstractInputGenerator> generator =
std::make_unique<fuzzer::JsonInputGenerator>(
seed,
type,
opts_.nullRatio,
fuzzer::getRandomInputGenerator(
seed, representedType, opts_.nullRatio),
true);
vector = fuzzer::ConstrainedVectorGenerator::generateFlat(
generator, vectorSize, pool_);
} else {
vector = type->isPrimitiveType() ? fuzzFlatPrimitive(type, vectorSize)
: fuzzComplex(type, vectorSize);
}
}
}

if (vectorSize > size) {
Expand Down Expand Up @@ -290,11 +329,16 @@ VectorPtr VectorFuzzer::fuzz(const GeneratorSpec& generatorSpec) {
return generatorSpec.generateData(rng_, pool_, opts_.vectorSize);
}

VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type) {
return fuzzConstant(type, opts_.vectorSize);
VectorPtr VectorFuzzer::fuzzConstant(
const TypePtr& type,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
return fuzzConstant(type, opts_.vectorSize, customGenerator);
}

VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) {
VectorPtr VectorFuzzer::fuzzConstant(
const TypePtr& type,
vector_size_t size,
const std::shared_ptr<AbstractInputGenerator>& customGenerator) {
// For constants, there are two possible cases:
// - generate a regular constant vector (only for primitive types).
// - generate a random vector and wrap it using a constant vector.
Expand All @@ -313,7 +357,8 @@ VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) {
type,
size,
rng_,
opts_);
opts_,
customGenerator);
}
}

Expand All @@ -334,9 +379,10 @@ VectorPtr VectorFuzzer::fuzzConstant(const TypePtr& type, vector_size_t size) {
opts_.maxConstantContainerSize.value(), opts_.containerLength);
opts_.complexElementsMaxSize = std::min<int32_t>(
opts_.maxConstantContainerSize.value(), opts_.complexElementsMaxSize);
// TODO: incorporate fuzzer options into customGenerator.
}
return BaseVector::wrapInConstant(
size, constantIndex, fuzz(type, innerVectorSize));
size, constantIndex, fuzz(type, innerVectorSize, customGenerator));
}

VectorPtr VectorFuzzer::fuzzFlat(const TypePtr& type) {
Expand Down Expand Up @@ -622,8 +668,11 @@ MapVectorPtr VectorFuzzer::fuzzMap(
values);
}

RowVectorPtr VectorFuzzer::fuzzInputRow(const RowTypePtr& rowType) {
return fuzzRow(rowType, opts_.vectorSize, false);
RowVectorPtr VectorFuzzer::fuzzInputRow(
const RowTypePtr& rowType,
const std::vector<std::shared_ptr<AbstractInputGenerator>>&
inputGenerators) {
return fuzzRow(rowType, opts_.vectorSize, false, inputGenerators);
}

RowVectorPtr VectorFuzzer::fuzzInputFlatRow(const RowTypePtr& rowType) {
Expand Down Expand Up @@ -680,14 +729,19 @@ RowVectorPtr VectorFuzzer::fuzzRow(const RowTypePtr& rowType) {
RowVectorPtr VectorFuzzer::fuzzRow(
const RowTypePtr& rowType,
vector_size_t size,
bool allowTopLevelNulls) {
bool allowTopLevelNulls,
const std::vector<std::shared_ptr<AbstractInputGenerator>>&
inputGenerators) {
std::vector<VectorPtr> children;
children.reserve(rowType->size());

for (auto i = 0; i < rowType->size(); ++i) {
const auto& inputGenerator =
inputGenerators.size() > i ? inputGenerators[i] : nullptr;
children.push_back(
opts_.containerHasNulls ? fuzz(rowType->childAt(i), size)
: fuzzNotNull(rowType->childAt(i), size));
opts_.containerHasNulls
? fuzz(rowType->childAt(i), size, inputGenerator)
: fuzzNotNull(rowType->childAt(i), size, inputGenerator));
}

return std::make_shared<RowVector>(
Expand Down
Loading

0 comments on commit f51f6c0

Please sign in to comment.