diff --git a/src/main/java/org/jabref/logic/search/query/SearchQueryConversion.java b/src/main/java/org/jabref/logic/search/query/SearchQueryConversion.java index 9c2b8ea0ff2..e782f31d1dc 100644 --- a/src/main/java/org/jabref/logic/search/query/SearchQueryConversion.java +++ b/src/main/java/org/jabref/logic/search/query/SearchQueryConversion.java @@ -7,7 +7,6 @@ import org.jabref.model.search.query.SqlQueryNode; import org.jabref.search.SearchParser; -import org.apache.lucene.search.Query; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,9 +23,9 @@ public static String flagsToSearchExpression(SearchQuery searchQuery) { return new SearchFlagsToExpressionVisitor(searchQuery.getSearchFlags()).visit(searchQuery.getContext()); } - public static Query searchToLucene(SearchQuery searchQuery) { + public static String searchToLucene(SearchQuery searchQuery) { LOGGER.debug("Converting search expression to Lucene: {}", searchQuery.getSearchExpression()); - return new SearchToLuceneVisitor().visit(searchQuery.getContext()); + return new SearchToLuceneVisitor(searchQuery.getSearchFlags()).visit(searchQuery.getContext()); } public static List extractSearchTerms(SearchQuery searchQuery) { diff --git a/src/main/java/org/jabref/logic/search/query/SearchToLuceneVisitor.java b/src/main/java/org/jabref/logic/search/query/SearchToLuceneVisitor.java index 4f38a0ed1e2..c25a162045c 100644 --- a/src/main/java/org/jabref/logic/search/query/SearchToLuceneVisitor.java +++ b/src/main/java/org/jabref/logic/search/query/SearchToLuceneVisitor.java @@ -1,152 +1,131 @@ package org.jabref.logic.search.query; +import java.util.EnumSet; import java.util.List; +import java.util.Locale; import org.jabref.model.search.LinkedFilesConstants; +import org.jabref.model.search.SearchFlags; import org.jabref.search.SearchBaseVisitor; import org.jabref.search.SearchParser; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.MatchNoDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.RegexpQuery; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.util.QueryBuilder; +import org.apache.lucene.queryparser.classic.QueryParser; /** * Tests are located in {@link org.jabref.logic.search.query.SearchQueryLuceneConversionTest}. */ -public class SearchToLuceneVisitor extends SearchBaseVisitor { +public class SearchToLuceneVisitor extends SearchBaseVisitor { + private final EnumSet searchFlags; - private static final List SEARCH_FIELDS = LinkedFilesConstants.PDF_FIELDS; - - private final QueryBuilder queryBuilder; - - public SearchToLuceneVisitor() { - this.queryBuilder = new QueryBuilder(LinkedFilesConstants.LINKED_FILES_ANALYZER); + public SearchToLuceneVisitor(EnumSet searchFlags) { + this.searchFlags = searchFlags; } @Override - public Query visitStart(SearchParser.StartContext ctx) { + public String visitStart(SearchParser.StartContext ctx) { return visit(ctx.andExpression()); } @Override - public Query visitImplicitAndExpression(SearchParser.ImplicitAndExpressionContext ctx) { - List children = ctx.expression().stream().map(this::visit).toList(); - if (children.size() == 1) { - return children.getFirst(); - } - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Query child : children) { - builder.add(child, BooleanClause.Occur.MUST); - } - return builder.build(); + public String visitImplicitAndExpression(SearchParser.ImplicitAndExpressionContext ctx) { + List children = ctx.expression().stream().map(this::visit).toList(); + return children.size() == 1 ? children.getFirst() : String.join(" ", children); } @Override - public Query visitParenExpression(SearchParser.ParenExpressionContext ctx) { - return visit(ctx.andExpression()); + public String visitParenExpression(SearchParser.ParenExpressionContext ctx) { + String expr = visit(ctx.andExpression()); + return expr.isEmpty() ? "" : "(" + expr + ")"; } @Override - public Query visitNegatedExpression(SearchParser.NegatedExpressionContext ctx) { - Query innerQuery = visit(ctx.expression()); - if (innerQuery instanceof MatchNoDocsQuery) { - return innerQuery; - } - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - builder.add(innerQuery, BooleanClause.Occur.MUST_NOT); - return builder.build(); + public String visitNegatedExpression(SearchParser.NegatedExpressionContext ctx) { + return "NOT (" + visit(ctx.expression()) + ")"; } @Override - public Query visitBinaryExpression(SearchParser.BinaryExpressionContext ctx) { - Query left = visit(ctx.left); - Query right = visit(ctx.right); + public String visitBinaryExpression(SearchParser.BinaryExpressionContext ctx) { + String left = visit(ctx.left); + String right = visit(ctx.right); - if (left instanceof MatchNoDocsQuery) { + if (left.isEmpty() && right.isEmpty()) { + return ""; + } + if (left.isEmpty()) { return right; } - if (right instanceof MatchNoDocsQuery) { + if (right.isEmpty()) { return left; } - BooleanQuery.Builder builder = new BooleanQuery.Builder(); + String operator = ctx.bin_op.getType() == SearchParser.AND ? " AND " : " OR "; + return left + operator + right; + } + + @Override + public String visitComparison(SearchParser.ComparisonContext ctx) { + String term = SearchQueryConversion.unescapeSearchValue(ctx.searchValue()); + boolean isQuoted = ctx.searchValue().getStart().getType() == SearchParser.STRING_LITERAL; + + // unfielded expression + if (ctx.FIELD() == null) { + if (searchFlags.contains(SearchFlags.REGULAR_EXPRESSION)) { + return "/" + term + "/"; + } + return isQuoted ? "\"" + escapeQuotes(term) + "\"" : QueryParser.escape(term); + } - if (ctx.bin_op.getType() == SearchParser.AND) { - builder.add(left, BooleanClause.Occur.MUST); - builder.add(right, BooleanClause.Occur.MUST); - } else if (ctx.bin_op.getType() == SearchParser.OR) { - builder.add(left, BooleanClause.Occur.SHOULD); - builder.add(right, BooleanClause.Occur.SHOULD); + String field = ctx.FIELD().getText().toLowerCase(Locale.ROOT); + if (!isValidField(field)) { + return ""; } - return builder.build(); + field = "any".equals(field) || "anyfield".equals(field) ? "" : field + ":"; + int operator = ctx.operator().getStart().getType(); + return buildFieldExpression(field, term, operator, isQuoted); } - @Override - public Query visitComparisonExpression(SearchParser.ComparisonExpressionContext ctx) { - return visit(ctx.comparison()); + private boolean isValidField(String field) { + return "any".equals(field) || "anyfield".equals(field) || LinkedFilesConstants.PDF_FIELDS.contains(field); } - @Override - public Query visitComparison(SearchParser.ComparisonContext ctx) { - String field = ctx.FIELD() == null ? null : ctx.FIELD().getText(); - String term = SearchQueryConversion.unescapeSearchValue(ctx.searchValue()); + private String buildFieldExpression(String field, String term, int operator, boolean isQuoted) { + boolean isRegexOp = isRegexOperator(operator); + boolean isNegationOp = isNegationOperator(operator); - // unfielded expression - if (field == null || "anyfield".equals(field) || "any".equals(field)) { - return createMultiFieldQuery(term, ctx.operator()); - } else if (SEARCH_FIELDS.contains(field)) { - return createFieldQuery(field, term, ctx.operator()); + if (isRegexOp) { + String expression = field + "/" + term + "/"; + return isNegationOp ? "NOT " + expression : expression; } else { - return new MatchNoDocsQuery(); + term = isQuoted ? "\"" + escapeQuotes(term) + "\"" : QueryParser.escape(term); + String expression = field + term; + return isNegationOp ? "NOT " + expression : expression; } } - private Query createMultiFieldQuery(String value, SearchParser.OperatorContext operator) { - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (String field : SEARCH_FIELDS) { - builder.add(createFieldQuery(field, value, operator), BooleanClause.Occur.SHOULD); - } - return builder.build(); + private static String escapeQuotes(String term) { + return term.replace("\"", "\\\""); } - private Query createFieldQuery(String field, String value, SearchParser.OperatorContext operator) { - if (operator == null) { - return createTermOrPhraseQuery(field, value); - } - - return switch (operator.getStart().getType()) { - case SearchParser.REQUAL, - SearchParser.CREEQUAL -> - new RegexpQuery(new Term(field, value)); + private static boolean isNegationOperator(int operator) { + return switch (operator) { case SearchParser.NEQUAL, SearchParser.NCEQUAL, SearchParser.NEEQUAL, - SearchParser.NCEEQUAL -> - createNegatedQuery(createTermOrPhraseQuery(field, value)); - case SearchParser.NREQUAL, - SearchParser.NCREEQUAL -> - createNegatedQuery(new RegexpQuery(new Term(field, value))); - default -> - createTermOrPhraseQuery(field, value); + SearchParser.NCEEQUAL, + SearchParser.NREQUAL, + SearchParser.NCREEQUAL -> true; + default -> false; }; } - private Query createNegatedQuery(Query query) { - BooleanQuery.Builder negatedQuery = new BooleanQuery.Builder(); - negatedQuery.add(query, BooleanClause.Occur.MUST_NOT); - return negatedQuery.build(); - } - - private Query createTermOrPhraseQuery(String field, String value) { - if (value.contains("*") || value.contains("?")) { - return new TermQuery(new Term(field, value)); - } - return queryBuilder.createPhraseQuery(field, value); + private static boolean isRegexOperator(int operator) { + return switch (operator) { + case SearchParser.REQUAL, + SearchParser.CREEQUAL, + SearchParser.NREQUAL, + SearchParser.NCREEQUAL -> true; + default -> false; + }; } } diff --git a/src/main/java/org/jabref/logic/search/retrieval/LinkedFilesSearcher.java b/src/main/java/org/jabref/logic/search/retrieval/LinkedFilesSearcher.java index 5b2a93047fe..1e3e47a11b4 100644 --- a/src/main/java/org/jabref/logic/search/retrieval/LinkedFilesSearcher.java +++ b/src/main/java/org/jabref/logic/search/retrieval/LinkedFilesSearcher.java @@ -22,6 +22,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.StoredFields; +import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -39,11 +42,14 @@ public final class LinkedFilesSearcher { private final FilePreferences filePreferences; private final BibDatabaseContext databaseContext; private final SearcherManager searcherManager; + private final MultiFieldQueryParser parser; public LinkedFilesSearcher(BibDatabaseContext databaseContext, LuceneIndexer linkedFilesIndexer, FilePreferences filePreferences) { this.searcherManager = linkedFilesIndexer.getSearcherManager(); this.databaseContext = databaseContext; this.filePreferences = filePreferences; + this.parser = new MultiFieldQueryParser(LinkedFilesConstants.PDF_FIELDS.toArray(new String[0]), LinkedFilesConstants.LINKED_FILES_ANALYZER); + parser.setDefaultOperator(QueryParser.Operator.AND); } public SearchResults search(SearchQuery searchQuery) { @@ -51,17 +57,21 @@ public SearchResults search(SearchQuery searchQuery) { return new SearchResults(); } - Query luceneQuery = SearchQueryConversion.searchToLucene(searchQuery); + Optional luceneQuery = getLuceneQuery(searchQuery); + if (luceneQuery.isEmpty()) { + return new SearchResults(); + } + EnumSet searchFlags = searchQuery.getSearchFlags(); boolean shouldSearchInLinkedFiles = searchFlags.contains(SearchFlags.FULLTEXT) && filePreferences.shouldFulltextIndexLinkedFiles(); if (!shouldSearchInLinkedFiles) { return new SearchResults(); } - LOGGER.debug("Searching in linked files with query: {}", luceneQuery); + LOGGER.debug("Searching in linked files with query: {}", luceneQuery.get()); try { IndexSearcher linkedFilesIndexSearcher = acquireIndexSearcher(searcherManager); - SearchResults searchResults = search(linkedFilesIndexSearcher, luceneQuery); + SearchResults searchResults = search(linkedFilesIndexSearcher, luceneQuery.get()); releaseIndexSearcher(searcherManager, linkedFilesIndexSearcher); return searchResults; } catch (IOException | IndexSearcher.TooManyClauses e) { @@ -70,6 +80,16 @@ public SearchResults search(SearchQuery searchQuery) { return new SearchResults(); } + private Optional getLuceneQuery(SearchQuery searchQuery) { + String query = SearchQueryConversion.searchToLucene(searchQuery); + try { + return Optional.of(parser.parse(query)); + } catch (ParseException e) { + LOGGER.error("Error during query parsing", e); + return Optional.empty(); + } + } + private SearchResults search(IndexSearcher indexSearcher, Query searchQuery) throws IOException { TopDocs topDocs = indexSearcher.search(searchQuery, Integer.MAX_VALUE); StoredFields storedFields = indexSearcher.storedFields(); diff --git a/src/test/java/org/jabref/logic/search/query/SearchQueryExtractorConversionTest.java b/src/test/java/org/jabref/logic/search/query/SearchQueryExtractorConversionTest.java index bdd4340f583..e6d9498fb59 100644 --- a/src/test/java/org/jabref/logic/search/query/SearchQueryExtractorConversionTest.java +++ b/src/test/java/org/jabref/logic/search/query/SearchQueryExtractorConversionTest.java @@ -26,7 +26,8 @@ public static Stream searchConversion() { Arguments.of(List.of(), "NOT a"), Arguments.of(List.of("a", "b", "c"), "(any = a OR any = b) AND NOT (NOT c AND title = d)"), Arguments.of(List.of("b", "c"), "title != a OR b OR c"), - Arguments.of(List.of("a", "b"), "a b") + Arguments.of(List.of("a", "b"), "a b"), + Arguments.of(List.of("term1 term2"), "\"term1 term2\"") ); } diff --git a/src/test/java/org/jabref/logic/search/query/SearchQueryLuceneConversionTest.java b/src/test/java/org/jabref/logic/search/query/SearchQueryLuceneConversionTest.java index 992cdd7dca5..7936a99b555 100644 --- a/src/test/java/org/jabref/logic/search/query/SearchQueryLuceneConversionTest.java +++ b/src/test/java/org/jabref/logic/search/query/SearchQueryLuceneConversionTest.java @@ -14,53 +14,55 @@ class SearchQueryLuceneConversionTest { public static Stream searchConversion() { return Stream.of( - Arguments.of("content:term annotations:term", "term"), - Arguments.of("content:term annotations:term", "any = term"), - Arguments.of("content:term annotations:term", "any CONTAINS term"), - Arguments.of("content:term annotations:term", "any MATCHES term"), - Arguments.of("content:term annotations:term", "any =! term"), - Arguments.of("content:term annotations:term", "any == term"), - Arguments.of("content:term annotations:term", "any ==! term"), + Arguments.of("a", "a"), + Arguments.of("the", "the"), + Arguments.of("term", "term"), + Arguments.of("term", "any = term"), + Arguments.of("term", "any CONTAINS term"), + Arguments.of("term", "any MATCHES term"), + Arguments.of("term", "any =! term"), + Arguments.of("term", "any == term"), + Arguments.of("term", "any ==! term"), - Arguments.of("content:\"two term\" annotations:\"two term\"", "\"two terms\""), - Arguments.of("content:\"two term\" annotations:\"two term\"", "any = \"two terms\""), + Arguments.of("\"two terms\"", "\"two terms\""), + Arguments.of("\"two terms\"", "any = \"two terms\""), + Arguments.of("NOT (term)", "NOT term"), - Arguments.of("content:imag", "content = image"), - Arguments.of("annotations:imag", "annotations = image"), - Arguments.of("content:\"imag process\"", "content = \"image processing\""), - Arguments.of("+content:imag +annotations:process", "content = image AND annotations = processing"), - Arguments.of("+(content:imag annotations:process) +(content:term annotations:term)", "(content = image OR annotations = processing) AND term"), - Arguments.of("(content:on annotations:on) (+(content:two annotations:two) +(content:three annotations:three))", "one OR (two AND three)"), + Arguments.of("content:image", "content = image"), + Arguments.of("annotations:image", "annotations = image"), + Arguments.of("content:\"image processing\"", "content = \"image processing\""), + Arguments.of("content:image AND annotations:processing", "content = image AND annotations = processing"), + Arguments.of("(content:image OR annotations:processing) AND term", "(content = image OR annotations = processing) AND term"), + Arguments.of("one OR (two AND three)", "one OR (two AND three)"), - Arguments.of("(-content:term) (-annotations:term)", "any != term"), - Arguments.of("(-content:term) (-annotations:term)", "any !== term"), - Arguments.of("(-content:term) (-annotations:term)", "any !=! term"), - Arguments.of("(-content:\"two term\") (-annotations:\"two term\")", "any != \"two terms\""), - Arguments.of("+(-content:imag) +(-annotations:process)", "content != image AND annotations != processing"), + Arguments.of("NOT term", "any != term"), + Arguments.of("NOT term", "any !== term"), + Arguments.of("NOT term", "any !=! term"), + Arguments.of("NOT \"two terms\"", "any != \"two terms\""), + Arguments.of("content:image AND NOT annotations:processing", "content = image AND annotations != processing"), - Arguments.of("MatchNoDocsQuery(\"\")", "title = image"), - Arguments.of("content:\"imag process\" annotations:\"imag process\"", "\"image processing\" AND author = smith"), - Arguments.of("+(content:imag annotations:imag) +(content:process annotations:process)", "image AND (title = term OR processing)"), - Arguments.of("(content:imag annotations:imag) (content:process annotations:process)", "image OR (title = term OR processing)"), - Arguments.of("MatchNoDocsQuery(\"\")", "title = \"image processing\" AND author = smith"), + // ignore non pdf fields + Arguments.of("", "title = image"), + Arguments.of("\"image processing\"", "\"image processing\" AND author = smith"), + Arguments.of("image AND (processing)", "image AND (title = term OR processing)"), + Arguments.of("image OR (processing)", "image OR (title = term OR processing)"), + Arguments.of("", "title = \"image processing\" AND author = smith"), - Arguments.of("content:neighbou?r annotations:neighbou?r", "neighbou?r"), - Arguments.of("content:neighbo* annotations:neighbo*", "neighbo*"), - Arguments.of("MatchNoDocsQuery(\"\")", "title = neighbou?r"), - Arguments.of("MatchNoDocsQuery(\"\")", "(title == chocolate) OR (author == smith)"), + Arguments.of("neighbou\\?r", "neighbou?r"), + Arguments.of("neighbo\\*", "neighbo*"), + Arguments.of("", "title = neighbou?r"), + Arguments.of("", "(title == chocolate) OR (author == smith)"), - Arguments.of("content:/(John|Doe).+(John|Doe)/ annotations:/(John|Doe).+(John|Doe)/", "any =~ \"(John|Doe).+(John|Doe)\""), - Arguments.of("content:/rev*/ annotations:/rev*/", "anyfield=~ rev*"), - Arguments.of("content:/*rev*/ annotations:/*rev*/", "anyfield=~ *rev*"), - Arguments.of("(-content:/.+/) (-annotations:/.+/)", "any !=~ .+"), - Arguments.of("(-content:/.+/) (-annotations:/.+/)", "groups !=~ .+ AND any !=~ .+") + // regex + Arguments.of("/(John|Doe).+(John|Doe)/", "any =~ \"(John|Doe).+(John|Doe)\""), + Arguments.of("/rev*/", "anyfield=~ rev*"), + Arguments.of("NOT /.+/", "any !=~ .+") ); } @ParameterizedTest @MethodSource void searchConversion(String expected, String searchExpression) { - String result = SearchQueryConversion.searchToLucene(new SearchQuery(searchExpression)).toString(); - assertEquals(expected, result); + assertEquals(expected, SearchQueryConversion.searchToLucene(new SearchQuery(searchExpression))); } }