From a7cf710cfb036485217461bad4a9d4b0273418fb Mon Sep 17 00:00:00 2001 From: Pedro Yamada Date: Mon, 22 Apr 2024 11:11:50 +1000 Subject: [PATCH] Implement `maximum_results` option When working on large mono-repos some queries might take a very long time due to listing all files in the project. This waiting time is a combination of building the results array and serialising it back. This commit adds a query option `maximum_results`, which allows clients to ask the server to truncate results above a certain threshold number. In that case, a flag is sent back indicating to the client that results have been truncated. --- watchman/InMemoryView.cpp | 8 ++++++++ watchman/cmds/query.cpp | 3 +++ watchman/cmds/since.cpp | 4 ++++ watchman/query/Query.h | 10 ++++++++++ watchman/query/QueryContext.h | 7 +++++++ watchman/query/QueryResult.h | 4 ++++ watchman/query/eval.cpp | 10 ++++++++++ watchman/query/parse.cpp | 23 +++++++++++++++++++++++ 8 files changed, 69 insertions(+) diff --git a/watchman/InMemoryView.cpp b/watchman/InMemoryView.cpp index 5b83d5b1afa3..a9ad26b8a9e5 100644 --- a/watchman/InMemoryView.cpp +++ b/watchman/InMemoryView.cpp @@ -588,6 +588,10 @@ void InMemoryView::timeGenerator(const Query* query, QueryContext* ctx) const { continue; } + if (ctx->num_results_over_maximum > 0) { + continue; + } + w_query_process_file( query, ctx, std::make_unique(f, caches_)); } @@ -899,6 +903,10 @@ void InMemoryView::allFilesGenerator(const Query* query, QueryContext* ctx) continue; } + if (ctx->num_results_over_maximum > 0) { + continue; + } + w_query_process_file( query, ctx, std::make_unique(f, caches_)); } diff --git a/watchman/cmds/query.cpp b/watchman/cmds/query.cpp index 0a4020b40127..2c58c9c925f2 100644 --- a/watchman/cmds/query.cpp +++ b/watchman/cmds/query.cpp @@ -40,6 +40,9 @@ static UntypedResponse cmd_query(Client* client, const json_ref& args) { if (res.savedStateInfo) { response.set("saved-state-info", std::move(*res.savedStateInfo)); } + if
(res.exceededMaximumResults) { + response.set("exceededMaximumResults", json_boolean(res.exceededMaximumResults)); + } add_root_warnings_to_response(response, root); diff --git a/watchman/cmds/since.cpp b/watchman/cmds/since.cpp index d9089c6c77f7..03ccd7284730 100644 --- a/watchman/cmds/since.cpp +++ b/watchman/cmds/since.cpp @@ -45,6 +45,10 @@ static UntypedResponse cmd_since(Client* client, const json_ref& args) { if (res.savedStateInfo) { response.set("saved-state-info", std::move(*res.savedStateInfo)); } + if (res.exceededMaximumResults) { + response.set("exceededMaximumResults", json_boolean(res.exceededMaximumResults)); + } + add_root_warnings_to_response(response, root); return response; diff --git a/watchman/query/Query.h b/watchman/query/Query.h index 3e5f3ed7f598..192e12fce903 100644 --- a/watchman/query/Query.h +++ b/watchman/query/Query.h @@ -48,6 +48,16 @@ struct Query { bool dedup_results = false; uint32_t bench_iterations = 0; + /** + * If provided, queries with more than `maximum_results` results will + * return at most `maximum_results` results and a flag indicating the + * response has been truncated. + * + * This is similar to the `empty_on_fresh_instance` option, but for + * all back-ends and responses. + */ + std::optional maximum_results = std::nullopt; + /** * Optional full path to relative root, without and with trailing slash. */ diff --git a/watchman/query/QueryContext.h b/watchman/query/QueryContext.h index bcfcc32bf2a5..fa2175073c76 100644 --- a/watchman/query/QueryContext.h +++ b/watchman/query/QueryContext.h @@ -69,6 +69,13 @@ struct QueryContext : QueryContextBase { // How many times we suppressed a result due to dedup checking uint32_t num_deduped{0}; + /** + * How many results were dropped due to `maximum_results` being exceeded. + * + * Note this may be at most 1 due to generators short-circuiting work once the limit + * is exceeded.
+ */ + uint32_t num_results_over_maximum{0}; // Disable fresh instance queries bool disableFreshInstance{false}; diff --git a/watchman/query/QueryResult.h b/watchman/query/QueryResult.h index d3ec4cca50ed..ae5225a7c661 100644 --- a/watchman/query/QueryResult.h +++ b/watchman/query/QueryResult.h @@ -37,6 +37,10 @@ struct QueryResult { uint32_t stateTransCountAtStartOfQuery; std::optional savedStateInfo; QueryDebugInfo debugInfo; + /** + * True if the result has been truncated due to the `maximum_results` query option. + */ + bool exceededMaximumResults = false; }; } // namespace watchman diff --git a/watchman/query/eval.cpp b/watchman/query/eval.cpp index 5c16a3632374..ae02cbf28201 100644 --- a/watchman/query/eval.cpp +++ b/watchman/query/eval.cpp @@ -115,6 +115,13 @@ void w_query_process_file( } } + // Handle `Query::maximum_results` in order to avoid handling a huge list of results. + // Once this is exceeded, all work could stop. + if (ctx->query->maximum_results && ctx->resultsArray.size() >= ctx->query->maximum_results) { + ctx->num_results_over_maximum += 1; + return; + } + ctx->maybeRender(std::move(ctx->file)); } @@ -225,6 +232,7 @@ static void execute_common( auto meta = json_object({ {"fresh_instance", json_boolean(res->isFreshInstance)}, {"num_deduped", json_integer(ctx->num_deduped)}, + {"exceeded_maximum", json_boolean(ctx->num_results_over_maximum > 0)}, {"num_results", json_integer(ctx->resultsArray.size())}, {"num_walked", json_integer(ctx->getNumWalked())}, }); @@ -237,6 +245,8 @@ static void execute_common( res->resultsArray = ctx->renderResults(); res->dedupedFileNames = std::move(ctx->dedup); + + res->exceededMaximumResults = ctx->num_results_over_maximum > 0; } // Capability indicating support for scm-aware since queries diff --git a/watchman/query/parse.cpp b/watchman/query/parse.cpp index 948e1055c63e..82ab35b10981 100644 --- a/watchman/query/parse.cpp +++ b/watchman/query/parse.cpp @@ -229,6 +229,13 @@ void 
parse_empty_on_fresh_instance(Query* res, const json_ref& query) { parse_bool_param(query, "empty_on_fresh_instance", false); } +void parse_maximum_results(Query* res, const json_ref& query) { + auto maximum_results = query.get_optional("maximum_results"); + if (maximum_results) { + res->maximum_results = parse_nonnegative_integer("maximum_results", maximum_results.value()); + } +} + void parse_always_include_directories(Query* res, const json_ref& query) { res->alwaysIncludeDirectories = parse_bool_param(query, "always_include_directories", false); @@ -274,6 +281,7 @@ std::shared_ptr parseQuery( parse_lock_timeout(res, query); parse_relative_root(root, res, query); parse_empty_on_fresh_instance(res, query); + parse_maximum_results(res, query); parse_fail_if_no_saved_state(res, query); parse_omit_changed_files(res, query); parse_always_include_directories(res, query); @@ -333,6 +341,9 @@ void w_query_legacy_field_list(QueryFieldList* flist) { // Translate from the legacy array into the new style, then // delegate to the main parser. // We build a big anyof expression +// +// If the old-format query contains an object, that object's fields are merged into the new-style query. +// For example: `["since", "target", "spec", { "more_fields": "here" }]` std::shared_ptr parseQueryLegacy( const std::shared_ptr& root, const json_ref& args, @@ -355,6 +366,14 @@ std::shared_ptr parseQueryLegacy( auto& args_array = args.array(); for (i = start; i < args_array.size(); i++) { + if (args_array[i].isObject()) { + const auto& object = args_array[i]; + for (const auto& entry : object.object()) { + query_obj.set(entry.first, json_ref(entry.second)); + } + continue; + } + const char* arg = json_string_value(args_array[i]); if (!arg) { /* not a string value!
*/ @@ -364,6 +383,10 @@ std::shared_ptr parseQueryLegacy( } for (i = start; i < json_array_size(args); i++) { + if (!args_array[i].isString()) { + continue; + } + const char* arg = json_string_value(args_array[i]); if (!strcmp(arg, "--")) { i++;