Skip to content

Commit

Permalink
CNDB-11655: Limit the number of clauses before optimizing the Plan
Browse files Browse the repository at this point in the history
Plan#optimize can take a very long time when given plans with thousands
of intersected clauses, which can result from using ngram analyzers.
Related issue: riptano/cndb#10085.

Fixes riptano/cndb#11655
  • Loading branch information
pkolaczk committed Nov 29, 2024
1 parent bc17f2a commit ba5b093
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 16 deletions.
4 changes: 3 additions & 1 deletion src/java/org/apache/cassandra/index/sai/plan/Plan.java
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ public final Plan optimize()

/**
* Modifies all intersections to not intersect more clauses than the given limit.
* Retains the most selective clauses.
*/
public final Plan limitIntersectedClauses(int clauseLimit)
{
Expand Down Expand Up @@ -1700,7 +1701,8 @@ private KeysIteration union(List<KeysIteration> subplans, int id)
}

/**
* Constructs a plan node representing an intersection of two key sets.
* Constructs a plan node representing an intersection of key sets.
* The subplans will be sorted by selectivity from the most selective to the least selective ones.
* @param subplans a list of subplans for intersected key sets
*/
public KeysIteration intersection(List<KeysIteration> subplans)
Expand Down
37 changes: 22 additions & 15 deletions src/java/org/apache/cassandra/index/sai/plan/QueryController.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ public class QueryController implements Plan.Executor, Plan.CostEstimator
/**
* Controls whether we optimize query plans.
* 0 disables the optimizer. As a side effect, hybrid ANN queries will default to FilterSortOrder.SCAN_THEN_FILTER.
* 1 enables the optimizer and tells the optimizer to respect the intersection clause limit.
* Higher values enable the optimizer and disable the hard intersection clause limit.
* 1 enables the optimizer.
* Note: the config is not final to simplify testing.
*/
@VisibleForTesting
Expand Down Expand Up @@ -346,34 +345,42 @@ Plan buildPlan()
rowsIteration = planFactory.recheckFilter(command.rowFilter(), rowsIteration);
rowsIteration = planFactory.limit(rowsIteration, command.limits().rows());

Plan optimizedPlan;
optimizedPlan = QUERY_OPT_LEVEL > 0
? rowsIteration.optimize()
: rowsIteration;
optimizedPlan = KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT > 0 && QUERY_OPT_LEVEL <= 1
? optimizedPlan.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT)
: optimizedPlan;
// Limit the number of intersected clauses before optimizing so we reduce the size of the
// plan given to the optimizer and hence reduce the plan search space and speed up optimization.
// It is possible that some index operators like ':' expand to a huge number of MATCH predicates
// (see CNDB-10085) and could overload the optimizer.
// The intersected subplans are ordered by selectivity in the way the best ones are at the beginning
// of the list, therefore this limit is unlikely to remove good branches of the tree.
// The limit here is higher than the final limit, so that the optimizer has a bit more freedom
// in which predicates it leaves in the plan and the probability of accidentally removing a good branch
// here is even lower.
Plan plan = rowsIteration.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT * 3);

if (optimizedPlan.contains(node -> node instanceof Plan.AnnIndexScan))
if (QUERY_OPT_LEVEL > 0)
plan = plan.optimize();

plan = plan.limitIntersectedClauses(KeyRangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT);

if (plan.contains(node -> node instanceof Plan.AnnIndexScan))
queryContext.setFilterSortOrder(QueryContext.FilterSortOrder.SCAN_THEN_FILTER);
if (optimizedPlan.contains(node -> node instanceof Plan.KeysSort))
if (plan.contains(node -> node instanceof Plan.KeysSort))
queryContext.setFilterSortOrder(QueryContext.FilterSortOrder.SEARCH_THEN_ORDER);

if (logger.isTraceEnabled())
logger.trace("Query execution plan:\n" + optimizedPlan.toStringRecursive());
logger.trace("Query execution plan:\n" + plan.toStringRecursive());

if (Tracing.isTracing())
{
Tracing.trace("Query execution plan:\n" + optimizedPlan.toStringRecursive());
Tracing.trace("Query execution plan:\n" + plan.toStringRecursive());
List<Plan.IndexScan> origIndexScans = keysIterationPlan.nodesOfType(Plan.IndexScan.class);
List<Plan.IndexScan> selectedIndexScans = optimizedPlan.nodesOfType(Plan.IndexScan.class);
List<Plan.IndexScan> selectedIndexScans = plan.nodesOfType(Plan.IndexScan.class);
Tracing.trace("Selecting {} {} of {} out of {} indexes",
selectedIndexScans.size(),
selectedIndexScans.size() > 1 ? "indexes with cardinalities" : "index with cardinality",
selectedIndexScans.stream().map(s -> "" + ((long) s.expectedKeys())).collect(Collectors.joining(", ")),
origIndexScans.size());
}
return optimizedPlan;
return plan;
}

private Plan.KeysIteration buildKeysIterationPlan()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class LuceneAnalyzerTest extends SAITester
{
Expand Down Expand Up @@ -783,4 +784,44 @@ public void testInvalidNamesOnConfig()
.isInstanceOf(InvalidRequestException.class)
.hasMessageContaining("Error configuring analyzer's filter 'synonym': Unknown parameters: {extraParam=xyz}");
}

@Test
public void testHighNumberOfMatchPredicates()
{
createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");

createIndex("CREATE CUSTOM INDEX ON %s(val) " +
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
"WITH OPTIONS = { 'index_analyzer': '{" +
" \"tokenizer\" : {\n" +
" \"name\" : \"ngram\",\n" +
" \"args\" : {\n" +
" \"minGramSize\":\"2\",\n" +
" \"maxGramSize\":\"3\"\n" +
" }\n" +
" }," +
" \"filters\" : [ {\"name\" : \"lowercase\"}] \n" +
" }'}");

// Long enough to generate several tens of thousands of ngrams:
String longParam = "The quick brown fox jumps over the lazy DOG.".repeat(250);

// Generally we expect this query to run in < 25 ms each on reasonably performant
// hardware, but because CI performance can have a lot of variability,
// we take the minimum, and we allow a large margin to avoid random failures.
var count = 5;
var minElapsed = Long.MAX_VALUE;
for (int i = 0; i < count; i++)
{
var startTime = System.currentTimeMillis();
execute("SELECT * FROM %s WHERE val : '" + longParam + '\'');
var elapsed = System.currentTimeMillis() - startTime;
if (elapsed < minElapsed)
minElapsed = elapsed;
// In extreme case we just want to bail out after the first iteration
if (elapsed > 10000)
break;
}
assertTrue("Query too slow: " + minElapsed + " ms", minElapsed < 1000);
}
}

0 comments on commit ba5b093

Please sign in to comment.