diff --git a/CMakeLists.txt b/CMakeLists.txt index 99a68805..1ae9bf11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13) project(tentris LANGUAGES CXX - VERSION 1.1.0) + VERSION 1.1.2) set(CMAKE_CXX_STANDARD 20) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/src/lib/tentris/tentris_version.hpp) @@ -72,7 +72,7 @@ target_include_directories(rapidjson INTERFACE find_package(tsl-hopscotch-map REQUIRED) find_package(fmt REQUIRED) find_package(hypertrie REQUIRED) -find_package(sparql-parser REQUIRED) +find_package(sparql-parser-base REQUIRED) find_package(rdf-parser REQUIRED) if (TENTRIS_STATIC) SET(Boost_USE_STATIC_LIBS ON) @@ -103,8 +103,8 @@ target_link_libraries(tentris stdc++fs # for #include cxxopts rapidjson + sparql-parser-base::sparql-parser-base tsl::hopscotch_map - sparql-parser::sparql-parser fmt::fmt hypertrie::hypertrie rdf-parser::rdf-parser diff --git a/Dockerfile b/Dockerfile index 81040e11..c3bb87fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,16 @@ -FROM ubuntu:groovy AS builder +FROM ubuntu:22.04 AS builder ARG DEBIAN_FRONTEND=noninteractive ARG TENTRIS_MARCH="x86-64" RUN apt-get -qq update && \ - apt-get -qq install -y make cmake uuid-dev git openjdk-11-jdk python3-pip python3-setuptools python3-wheel libstdc++-10-dev clang-11 g++-10 pkg-config lld autoconf libtool -RUN rm /usr/bin/ld && ln -s /usr/bin/lld-11 /usr/bin/ld -ARG CXX="clang++-11" -ARG CC="clang-11" + apt-get -qq install -y make cmake uuid-dev git openjdk-11-jdk python3-pip python3-setuptools python3-wheel libstdc++-11-dev clang-14 g++-11 pkg-config lld autoconf libtool +RUN rm /usr/bin/ld && ln -s /usr/bin/lld-14 /usr/bin/ld +ARG CXX="clang++-14" +ARG CC="clang-14" ENV CXXFLAGS="${CXXFLAGS} -march=${TENTRIS_MARCH}" ENV CMAKE_EXE_LINKER_FLAGS="-L/usr/local/lib/x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L/usr/lib/x86_64-linux-gnu -L/usr/local/lib" -# Compile more recent tcmalloc-minimal with clang-11 + -march +# Compile more recent tcmalloc-minimal with clang-14 + -march RUN git clone --quiet --branch gperftools-2.8.1 https://github.com/gperftools/gperftools WORKDIR /gperftools RUN ./autogen.sh diff --git a/conanfile.txt b/conanfile.txt index 5a9fab11..e0f538a8 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -2,15 +2,16 @@ boost/1.75.0 fmt/7.1.3 restinio/0.6.12 -hypertrie/0.6.0@dice-group/rc15 -rdf-parser/0.13.1@dice-group/stable -sparql-parser/0.3.0@dice-group/rc2 +hypertrie/0.6.1@dice-group/stable +rdf-parser/0.13.0@dice-group/stable +sparql-parser-base/0.2.2@dice-group/stable [options] restinio:asio=boost restinio:with_zlib=True boost:shared=False *:shared=False +sparql-parser-base:sparql_version=1.0 [generators] diff --git a/src/exec/TentrisTerminal.cpp b/src/exec/TentrisTerminal.cpp index 97a30514..1c17c61a 100644 --- a/src/exec/TentrisTerminal.cpp +++ b/src/exec/TentrisTerminal.cpp @@ -23,13 +23,13 @@ using namespace tentris::store; using namespace tentris::logging; using namespace tentris::store::cache; +using namespace tentris::store::sparql; using namespace std::filesystem; using namespace iter; using namespace tentris::tensor; using namespace std::chrono; using Variable = Dice::sparql::Variable; -using SelectModifier = Dice::sparql::Nodes::QueryNodes::SelectNodes::SelectModifier; TerminalConfig cfg; diff --git a/src/lib/tentris/http/SparqlEndpoint.hpp b/src/lib/tentris/http/SparqlEndpoint.hpp index a4a17f0f..f16d8b7c 100644 --- a/src/lib/tentris/http/SparqlEndpoint.hpp +++ b/src/lib/tentris/http/SparqlEndpoint.hpp @@ -12,6 +12,7 @@ #include #include "tentris/http/QueryResultState.hpp" +#include "tentris/store/SPARQL/ParsedSPARQL.hpp" #include "tentris/store/AtomicQueryExecutionPackageCache.hpp" #include "tentris/store/SparqlJsonResultSAXWriter.hpp" #include "tentris/store/AtomicTripleStore.hpp" @@ -28,7 +29,7 @@ namespace tentris::http { using namespace ::tentris::logging; using namespace ::tentris::store; - using SelectModifier = Dice::sparql::Nodes::QueryNodes::SelectNodes::SelectModifier; + using namespace ::tentris::store::sparql; using namespace ::tentris::tensor; using namespace std::string_literals; diff --git a/src/lib/tentris/store/QueryExecutionPackage.hpp b/src/lib/tentris/store/QueryExecutionPackage.hpp index 6f85b9ed..cfbc2e93 100644 --- a/src/lib/tentris/store/QueryExecutionPackage.hpp +++ b/src/lib/tentris/store/QueryExecutionPackage.hpp @@ -7,11 +7,9 @@ #include "tentris/store/RDF/TermStore.hpp" #include "tentris/store/AtomicTripleStore.hpp" +#include "tentris/store/SPARQL/ParsedSPARQL.hpp" #include "tentris/tensor/BoolHypertrie.hpp" -#include -#include - namespace tentris::store { class TripleStore; }; @@ -25,10 +23,10 @@ namespace tentris::store::cache { struct QueryExecutionPackage { using const_BoolHypertrie = ::tentris::tensor::const_BoolHypertrie; using time_point_t = logging::time_point_t; - using SelectModifier = Dice::sparql::Nodes::QueryNodes::SelectNodes::SelectModifier; + using SelectModifier = sparql::SelectModifier; using Variable = Dice::sparql::Variable; + using ParsedSPARQL = sparql::ParsedSPARQL; using Subscript = ::tentris::tensor::Subscript; - using SelectNode= Dice::sparql::Nodes::QueryNodes::SelectNodes::SelectNode; private: std::string sparql_string; @@ -61,18 +59,18 @@ namespace tentris::store::cache { explicit QueryExecutionPackage(const std::string &sparql_string) : sparql_string{sparql_string} { using namespace logging; logDebug(fmt::format("Parsing query: {}", sparql_string)); - std::shared_ptr selectNode= Dice::sparql_parser::parseSelectQuery(sparql_string); - subscript = std::make_shared( selectNode->getOperands(), selectNode->getSubscriptResult()); - select_modifier =selectNode->getSelectModifier(); + ParsedSPARQL parsed_sparql{sparql_string}; + subscript = parsed_sparql.getSubscript(); + select_modifier = parsed_sparql.getSelectModifier(); logDebug(fmt::format("Parsed subscript: {} [distinct = {}]", subscript, select_modifier == SelectModifier::DISTINCT)); - query_variables = selectNode->getSelectVariables(); + query_variables = parsed_sparql.getQueryVariables(); auto &triple_store = AtomicTripleStore::getInstance(); logDebug(fmt::format("Slicing TPs")); - for ([[maybe_unused]] const auto &[op_pos, tp]: iter::enumerate(selectNode->getBgps())) { + for ([[maybe_unused]] const auto &[op_pos, tp]: iter::enumerate(parsed_sparql.getBgps())) { logDebug(fmt::format("Slice key {}: ⟨{}⟩", op_pos, fmt::join(tp, ", "))); std::variant op = triple_store.resolveTriplePattern(tp); if (std::holds_alternative(op)) { @@ -150,7 +148,7 @@ struct fmt::formatter { template auto format(const tentris::store::cache::QueryExecutionPackage &p, FormatContext &ctx) { - using SelectModifier = Dice::sparql::Nodes::QueryNodes::SelectNodes::SelectModifier; + using SelectModifier = tentris::store::sparql::SelectModifier; return format_to(ctx.begin(), " SPARQL: {}\n" " subscript: {}\n" diff --git a/src/lib/tentris/store/SPARQL/ParsedSPARQL.hpp b/src/lib/tentris/store/SPARQL/ParsedSPARQL.hpp new file mode 100644 index 00000000..01be32d7 --- /dev/null +++ b/src/lib/tentris/store/SPARQL/ParsedSPARQL.hpp @@ -0,0 +1,434 @@ +#ifndef TENTRIS_SPARQLPARSER_HPP +#define TENTRIS_SPARQLPARSER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include + + +namespace tentris::store::sparql { + + namespace { + using Subscript = einsum::internal::Subscript; + namespace parser = Dice::sparql_parser::base; + using SparqlParser = parser::SparqlParser; + using namespace fmt::literals; + } + + + enum SelectModifier { + NONE, + DISTINCT, + REDUCE + }; + + class LexerErrorListener : public antlr4::BaseErrorListener { + using Variable = Dice::sparql::Variable; + public: + LexerErrorListener() = default; + + void + syntaxError([[maybe_unused]]antlr4::Recognizer *recognizer, [[maybe_unused]]antlr4::Token *offendingSymbol, + [[maybe_unused]]size_t line, [[maybe_unused]]size_t charPositionInLine, const std::string &msg, + [[maybe_unused]]std::exception_ptr e) override { + throw std::invalid_argument{msg}; + } + }; + + class ParserErrorListener : public antlr4::BaseErrorListener { + public: + ParserErrorListener() = default; + + void + syntaxError([[maybe_unused]]antlr4::Recognizer *recognizer, [[maybe_unused]]antlr4::Token *offendingSymbol, + [[maybe_unused]]size_t line, [[maybe_unused]]size_t charPositionInLine, const std::string &msg, + [[maybe_unused]]std::exception_ptr e) override { + throw std::invalid_argument{msg}; + } + }; + + + class ParsedSPARQL { + using Term = Dice::rdf::Term; + using BNode = Dice::rdf::BNode; + using Literal = Dice::rdf::Literal; + using URIRef = Dice::rdf::URIRef; + using TriplePattern = Dice::sparql::TriplePattern; + using VarOrTerm = Dice::sparql::VarOrTerm; + using Variable = Dice::sparql::Variable; + + using SparqlLexer = parser::SparqlLexer; + using ANTLRInputStream =antlr4::ANTLRInputStream; + using CommonTokenStream = antlr4::CommonTokenStream; + using QueryContext = SparqlParser::QueryContext; + std::string sparql_str; + + SelectModifier select_modifier = NONE; + + robin_hood::unordered_map prefixes{}; + std::vector query_variables{}; + robin_hood::unordered_set variables{}; + robin_hood::unordered_set anonym_variables{}; + std::vector bgps; + uint next_anon_var_id = 0; + std::shared_ptr subscript; + + public: + + ParsedSPARQL() = default; + + + explicit ParsedSPARQL(std::string sparqlstr) : + sparql_str{std::move(sparqlstr)} { + namespace ranges = std::ranges; + + std::istringstream str_stream{sparql_str}; + ANTLRInputStream input{str_stream}; + SparqlLexer lexer{&input}; + CommonTokenStream tokens{&lexer}; + SparqlParser parser{&tokens}; + // replace the error handler + auto lexerErrorListener = LexerErrorListener{}; + lexer.removeErrorListeners(); + lexer.addErrorListener(&lexerErrorListener); + + auto parserErrorListener = ParserErrorListener{}; + parser.removeParseListeners(); + parser.removeErrorListeners(); + parser.addErrorListener(&parserErrorListener); + // check that _query is present + QueryContext *_query = parser.query(); + if (_query == nullptr) + throw std::invalid_argument("The query was not parsable"); + else { + const std::vector &prefixDecl = _query->prologue()->prefixDecl(); + for (auto &prefix : prefixDecl) + // remove < and > from <...> + prefixes[prefix->PNAME_NS()->getText()] = std::string(prefix->IRI_REF()->getText(), 1, + prefix->IRI_REF()->getText().size() - 2); + + + SparqlParser::SelectQueryContext *select = _query->selectQuery(); + select_modifier = getSelectModifier(select); + bool all_vars = false; + if (std::vector vars = select->var(); not vars.empty()) + for (auto &var : vars) + query_variables.push_back(extractVariable(var)); + else + all_vars = true; + + std::queue tripleBlocks; + for (auto &block : select->whereClause()->groupGraphPattern()->triplesBlock()) + tripleBlocks.push(block); + while (not tripleBlocks.empty()) { + auto block = tripleBlocks.front(); + tripleBlocks.pop(); + SparqlParser::TriplesSameSubjectContext *triplesSameSubject = block->triplesSameSubject(); + + VarOrTerm subj = parseVarOrTerm(triplesSameSubject->varOrTerm()); + registerVariable(subj); + SparqlParser::PropertyListNotEmptyContext *propertyListNotEmpty = triplesSameSubject->propertyListNotEmpty(); + for (auto[pred_node, obj_nodes] : iter::zip(propertyListNotEmpty->verb(), + propertyListNotEmpty->objectList())) { + VarOrTerm pred = parseVerb(pred_node); + registerVariable(pred); + + for (auto &obj_node : obj_nodes->object()) { + VarOrTerm obj = parseObject(obj_node); + registerVariable(obj); + if(ranges::find(bgps, TriplePattern{subj, pred, obj}) == bgps.end()) + bgps.push_back(TriplePattern{subj, pred, obj}); + } + } + if (auto *next_block = block->triplesBlock(); next_block) + tripleBlocks.push(next_block); + } + for (const auto &variable : query_variables) + variables.insert(variable); + if (all_vars) + for (const auto &variable : variables) + if (not anonym_variables.contains(variable)) + query_variables.push_back(variable); + + + + using Label = Subscript::Label; + // generate subscript + robin_hood::unordered_map var_to_label{}; + Label next_label = 'a'; + for (const auto &var : variables) { + var_to_label[var] = next_label++; + } + std::vector> ops_labels{}; + for (const auto &bgp : bgps) { + std::vector