diff --git a/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java b/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java index d8b5a9b29c64..eeb2edd9c8aa 100644 --- a/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java +++ b/src/java/org/apache/cassandra/cql3/functions/IndexFcts.java @@ -64,7 +64,7 @@ public ByteBuffer execute(ProtocolVersion protocolVersion, List para LuceneAnalyzer luceneAnalyzer = null; List tokens = new ArrayList<>(); - try (Analyzer analyzer = JSONAnalyzerParser.parse(json)) + try (Analyzer analyzer = JSONAnalyzerParser.parse(json).left) { luceneAnalyzer = new LuceneAnalyzer(UTF8Type.instance, analyzer, new HashMap<>()); diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java index d974da34ccde..fd2ab5795dce 100644 --- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java +++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java @@ -106,6 +106,7 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.IndexMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; @@ -115,6 +116,13 @@ public class StorageAttachedIndex implements Index { + public static final String NGRAM_WITHOUT_QUERY_ANALYZER_WARNING = + "Using an ngram analyzer without defining a query_analyzer. " + + "This means that the same ngram analyzer will be applied to both indexed and queried column values. " + + "Applying ngram analysis to the queried values usually produces too many search tokens to be useful. " + + "The large number of tokens can also have a negative impact in performance. " + + "In most cases it's better to use a simpler query_analyzer such as the standard one."; + private static final Logger logger = LoggerFactory.getLogger(StorageAttachedIndex.class); private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); @@ -320,9 +328,16 @@ public static Map validateOptions(Map options, T } AbstractType type = TypeUtil.cellValueType(target.left, target.right); - AbstractAnalyzer.fromOptions(type, options); // will throw if invalid - if (AbstractAnalyzer.hasQueryAnalyzer(options)) - AbstractAnalyzer.fromOptionsQueryAnalyzer(type, options); // will throw if invalid + + // Validate analyzers by building them + try (AbstractAnalyzer.AnalyzerFactory analyzerFactory = AbstractAnalyzer.fromOptions(type, options)) + { + if (AbstractAnalyzer.hasQueryAnalyzer(options)) + AbstractAnalyzer.fromOptionsQueryAnalyzer(type, options).close(); + else if (analyzerFactory.isNGram()) + ClientWarn.instance.warn(NGRAM_WITHOUT_QUERY_ANALYZER_WARNING); + } + var config = IndexWriterConfig.fromOptions(null, type, options); // If we are indexing map entries we need to validate the sub-types diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java index 04fe75e6f42f..b7ad18340aaf 100644 --- a/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java +++ b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java @@ -42,6 +42,7 @@ import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sai.utils.TypeUtil; +import org.apache.cassandra.utils.Pair; import org.apache.lucene.analysis.Analyzer; public abstract class AbstractAnalyzer implements Iterator @@ -111,6 +112,15 @@ default boolean supportsEquals() return true; } + /** + * @return {@link true} if this analyzer configuration has a n-gram tokenizer or any of its filters is n-gram. + */ + default boolean isNGram() + { + return false; + } + + @Override default void close() { } @@ -132,7 +142,10 @@ public static AnalyzerFactory toAnalyzerFactory(String json, final AbstractType< try { - final Analyzer analyzer = JSONAnalyzerParser.parse(json); + Pair analyzerAndConfig = JSONAnalyzerParser.parse(json); + final Analyzer analyzer = analyzerAndConfig.left; + final boolean isNGram = analyzerAndConfig.right != null && analyzerAndConfig.right.isNGram(); + return new AnalyzerFactory() { @Override @@ -141,11 +154,18 @@ public void close() analyzer.close(); } + @Override public AbstractAnalyzer create() { return new LuceneAnalyzer(type, analyzer, options); } + @Override + public boolean isNGram() + { + return isNGram; + } + @Override public boolean supportsEquals() { diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/JSONAnalyzerParser.java b/src/java/org/apache/cassandra/index/sai/analyzer/JSONAnalyzerParser.java index 9c0bf061292f..5c4e259efee2 100644 --- a/src/java/org/apache/cassandra/index/sai/analyzer/JSONAnalyzerParser.java +++ b/src/java/org/apache/cassandra/index/sai/analyzer/JSONAnalyzerParser.java @@ -25,6 +25,7 @@ import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sai.analyzer.filter.BuiltInAnalyzers; +import org.apache.cassandra.utils.Pair; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharFilterFactory; import org.apache.lucene.analysis.TokenFilterFactory; @@ -36,12 +37,12 @@ public class JSONAnalyzerParser { private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); - public static Analyzer parse(String json) throws IOException + public static Pair parse(String json) throws IOException { Analyzer analyzer = matchBuiltInAnalzyer(json.toUpperCase()); if (analyzer != null) { - return analyzer; + return Pair.create(analyzer, null); } LuceneCustomAnalyzerConfig analyzerModel; @@ -147,12 +148,15 @@ public static Analyzer parse(String json) throws IOException throw new InvalidRequestException("Error configuring analyzer's charFilter '" + charFilter.getName() + "': " + e.getMessage()); } } - return builder.build(); + return Pair.create(builder.build(), analyzerModel); } - private static Analyzer matchBuiltInAnalzyer(String maybeAnalyzer) { - for (BuiltInAnalyzers analyzer : BuiltInAnalyzers.values()) { - if (analyzer.name().equals(maybeAnalyzer)) { + private static Analyzer matchBuiltInAnalzyer(String maybeAnalyzer) + { + for (BuiltInAnalyzers analyzer : BuiltInAnalyzers.values()) + { + if (analyzer.name().equals(maybeAnalyzer)) + { return analyzer.getNewAnalyzer(); } } diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/LuceneCustomAnalyzerConfig.java b/src/java/org/apache/cassandra/index/sai/analyzer/LuceneCustomAnalyzerConfig.java index 5e2bff5ab4fc..de8bfbe6091a 100644 --- a/src/java/org/apache/cassandra/index/sai/analyzer/LuceneCustomAnalyzerConfig.java +++ b/src/java/org/apache/cassandra/index/sai/analyzer/LuceneCustomAnalyzerConfig.java @@ -51,4 +51,21 @@ public List getCharFilters() { return charFilters; } + + /** + * @return {@link true} if this analyzer configuration has a n-gram tokenizer or any of its filters is n-gram. + */ + public boolean isNGram() + { + if (getTokenizer().getName().equals("ngram")) + return true; + + for (LuceneClassNameAndArgs filter : getFilters()) + { + if (filter.getName().equals("ngram")) + return true; + } + + return false; + } } diff --git a/test/unit/org/apache/cassandra/index/sai/analyzer/LuceneAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sai/analyzer/LuceneAnalyzerTest.java index 4e9ed0d6c00e..3036ae287107 100644 --- a/test/unit/org/apache/cassandra/index/sai/analyzer/LuceneAnalyzerTest.java +++ b/test/unit/org/apache/cassandra/index/sai/analyzer/LuceneAnalyzerTest.java @@ -189,14 +189,14 @@ public void testMissingClassException() throws Exception public static List tokenize(String testString, String json) throws Exception { - Analyzer luceneAnalyzer = JSONAnalyzerParser.parse(json); - LuceneAnalyzer analyzer = new LuceneAnalyzer(UTF8Type.instance, luceneAnalyzer, new HashMap()); + Analyzer luceneAnalyzer = JSONAnalyzerParser.parse(json).left; + LuceneAnalyzer analyzer = new LuceneAnalyzer(UTF8Type.instance, luceneAnalyzer, new HashMap<>()); ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes(Charsets.UTF_8)); analyzer.reset(toAnalyze); ByteBuffer analyzed = null; - List list = new ArrayList(); + List list = new ArrayList<>(); while (analyzer.hasNext()) { diff --git a/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java index 2403bfae8b2a..520ff6e17d9b 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/LuceneAnalyzerTest.java @@ -29,8 +29,10 @@ import org.apache.cassandra.cql3.restrictions.StatementRestrictions; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sai.SAITester; +import org.apache.cassandra.index.sai.StorageAttachedIndex; import org.apache.cassandra.index.sai.analyzer.AnalyzerEqOperatorSupport; import org.apache.cassandra.index.sai.analyzer.filter.BuiltInAnalyzers; +import org.apache.cassandra.service.ClientWarn; import org.assertj.core.api.Assertions; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -824,4 +826,61 @@ public void testHighNumberOfMatchPredicates() } assertTrue("Query too slow: " + minElapsed + " ms", minElapsed < 1000); } + + @Test + public void testClientWarningOnNGram() + { + // no explicit analyzer + assertNoWarning("{}"); + assertNoWarning("{'ascii': 'true', 'case_sensitive': 'false', 'normalize': 'true'}"); + + // standard analyzer + assertNoWarning("{'index_analyzer': 'standard'}"); + assertNoWarning("{'index_analyzer': 'whitespace'}"); + + // custom non-ngram analyzer + + // ngram analyzer without query_analyzer + assertClientWarningOnNGram("{'index_analyzer': '{" + + " \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}," + + " \"filters\":[{\"name\":\"lowercase\"}]}'}"); + assertClientWarningOnNGram("{'index_analyzer': '{" + + " \"tokenizer\":{\"name\":\"whitespace\"}," + + " \"filters\":[{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}]}'}"); + + // ngram analyzer with ngram query_analyzer + assertNoWarning("{'index_analyzer': '{" + + " \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}}'," + + "'query_analyzer': '{" + + " \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}}'}"); + + // ngram analyzer with non-ngram query_analyzer + assertNoWarning("{'index_analyzer': '{" + + " \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}," + + " \"filters\":[{\"name\":\"lowercase\"}]}'," + + "'query_analyzer': '{" + + " \"tokenizer\":{\"name\":\"whitespace\"}," + + " \"filters\":[{\"name\":\"porterstem\"}]}'}"); + } + + private void assertClientWarningOnNGram(String indexOptions) + { + createIndexFromOptions(indexOptions); + Assertions.assertThat(ClientWarn.instance.getWarnings()) + .hasSize(1) + .allMatch(w -> w.contains(StorageAttachedIndex.NGRAM_WITHOUT_QUERY_ANALYZER_WARNING)); + } + + private void assertNoWarning(String indexOptions) + { + createIndexFromOptions(indexOptions); + Assertions.assertThat(ClientWarn.instance.getWarnings()).isNull(); + } + + private void createIndexFromOptions(String options) + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)"); + ClientWarn.instance.captureWarnings(); + createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex' WITH OPTIONS = " + options); + } }