From ba5b0935e89b54d0d3c8430d1a69ed901ca18f0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= Date: Fri, 15 Nov 2024 18:05:09 +0100 Subject: [PATCH] CNDB-11655: Limit the number of clauses before optimizing the Plan Plan#optimize can take a very long time when given plans with thousands of intersected clauses, which can result from using ngram analyzers. Related issue: https://github.com/riptano/cndb/issues/10085. Fixes https://github.com/riptano/cndb/issues/11655 --- .../apache/cassandra/index/sai/plan/Plan.java | 4 +- .../index/sai/plan/QueryController.java | 37 ++++++++++------- .../index/sai/cql/LuceneAnalyzerTest.java | 41 +++++++++++++++++++ 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/src/java/org/apache/cassandra/index/sai/plan/Plan.java b/src/java/org/apache/cassandra/index/sai/plan/Plan.java index cf6119f47d1d..2bb379a16f14 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/Plan.java +++ b/src/java/org/apache/cassandra/index/sai/plan/Plan.java @@ -361,6 +361,7 @@ public final Plan optimize() /** * Modifies all intersections to not intersect more clauses than the given limit. + * Retains the most selective clauses. */ public final Plan limitIntersectedClauses(int clauseLimit) { @@ -1700,7 +1701,8 @@ private KeysIteration union(List subplans, int id) } /** - * Constructs a plan node representing an intersection of two key sets. + * Constructs a plan node representing an intersection of key sets. + * The subplans will be sorted by selectivity from the most selective to the least selective ones. * @param subplans a list of subplans for intersected key sets */ public KeysIteration intersection(List subplans) diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java index 48a911fc9051..9a8ee4ecff00 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java +++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java @@ -98,8 +98,7 @@ public class QueryController implements Plan.Executor, Plan.CostEstimator /** * Controls whether we optimize query plans. * 0 disables the optimizer. As a side effect, hybrid ANN queries will default to FilterSortOrder.SCAN_THEN_FILTER. - * 1 enables the optimizer and tells the optimizer to respect the intersection clause limit. - * Higher values enable the optimizer and disable the hard intersection clause limit. + * 1 enables the optimizer. * Note: the config is not final to simplify testing. */ @VisibleForTesting @@ -346,34 +345,42 @@ Plan buildPlan() rowsIteration = planFactory.recheckFilter(command.rowFilter(), rowsIteration); rowsIteration = planFactory.limit(rowsIteration, command.limits().rows()); - Plan optimizedPlan; - optimizedPlan = QUERY_OPT_LEVEL > 0 - ? rowsIteration.optimize() - : rowsIteration; - optimizedPlan = KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT > 0 && QUERY_OPT_LEVEL <= 1 - ? optimizedPlan.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT) - : optimizedPlan; + // Limit the number of intersected clauses before optimizing so we reduce the size of the + // plan given to the optimizer and hence reduce the plan search space and speed up optimization. + // It is possible that some index operators like ':' expand to a huge number of MATCH predicates + // (see CNDB-10085) and could overload the optimizer. + // The intersected subplans are ordered by selectivity in the way the best ones are at the beginning + // of the list, therefore this limit is unlikely to remove good branches of the tree. + // The limit here is higher than the final limit, so that the optimizer has a bit more freedom + // in which predicates it leaves in the plan and the probability of accidentally removing a good branch + // here is even lower. + Plan plan = rowsIteration.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT * 3); - if (optimizedPlan.contains(node -> node instanceof Plan.AnnIndexScan)) + if (QUERY_OPT_LEVEL > 0) + plan = plan.optimize(); + + plan = plan.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT); + + if (plan.contains(node -> node instanceof Plan.AnnIndexScan)) queryContext.setFilterSortOrder(QueryContext.FilterSortOrder.SCAN_THEN_FILTER); - if (optimizedPlan.contains(node -> node instanceof Plan.KeysSort)) + if (plan.contains(node -> node instanceof Plan.KeysSort)) queryContext.setFilterSortOrder(QueryContext.FilterSortOrder.SEARCH_THEN_ORDER); if (logger.isTraceEnabled()) - logger.trace("Query execution plan:\n" + optimizedPlan.toStringRecursive()); + logger.trace("Query execution plan:\n" + plan.toStringRecursive()); if (Tracing.isTracing()) { - Tracing.trace("Query execution plan:\n" + optimizedPlan.toStringRecursive()); + Tracing.trace("Query execution plan:\n" + plan.toStringRecursive()); List origIndexScans = keysIterationPlan.nodesOfType(Plan.IndexScan.class); - List selectedIndexScans = optimizedPlan.nodesOfType(Plan.IndexScan.class); + List selectedIndexScans = plan.nodesOfType(Plan.IndexScan.class); Tracing.trace("Selecting {} {} of {} out of {} indexes", selectedIndexScans.size(), selectedIndexScans.size() > 1 ? "indexes with cardinalities" : "index with cardinality", selectedIndexScans.stream().map(s -> "" + ((long) s.expectedKeys())).collect(Collectors.joining(", ")), origIndexScans.size()); } - return optimizedPlan; + return plan; } private Plan.KeysIteration buildKeysIterationPlan() diff --git a/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java index c4368b8ed4ec..2403bfae8b2a 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java @@ -35,6 +35,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class LuceneAnalyzerTest extends SAITester { @@ -783,4 +784,44 @@ public void testInvalidNamesOnConfig() .isInstanceOf(InvalidRequestException.class) .hasMessageContaining("Error configuring analyzer's filter 'synonym': Unknown parameters: {extraParam=xyz}"); } + + @Test + public void testHighNumberOfMatchPredicates() + { + createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)"); + + createIndex("CREATE CUSTOM INDEX ON %s(val) " + + "USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " + + "WITH OPTIONS = { 'index_analyzer': '{" + + " \"tokenizer\" : {\n" + + " \"name\" : \"ngram\",\n" + + " \"args\" : {\n" + + " \"minGramSize\":\"2\",\n" + + " \"maxGramSize\":\"3\"\n" + + " }\n" + + " }," + + " \"filters\" : [ {\"name\" : \"lowercase\"}] \n" + + " }'}"); + + // Long enough to generate several tens of thousands of ngrams: + String longParam = "The quick brown fox jumps over the lazy DOG.".repeat(250); + + // Generally we expect this query to run in < 25 ms each on reasonably performant + // hardware, but because CI performance can have a lot of variability, + // we take the minimum, and we allow a large margin to avoid random failures. + var count = 5; + var minElapsed = Long.MAX_VALUE; + for (int i = 0; i < count; i++) + { + var startTime = System.currentTimeMillis(); + execute("SELECT * FROM %s WHERE val : '" + longParam + '\''); + var elapsed = System.currentTimeMillis() - startTime; + if (elapsed < minElapsed) + minElapsed = elapsed; + // In extreme case we just want to bail out after the first iteration + if (elapsed > 10000) + break; + } + assertTrue("Query too slow: " + minElapsed + " ms", minElapsed < 1000); + } }