Skip to content

Commit

Permalink
CNDB-11742: Add a client warning when using an n-gram analyzer withou…
Browse files Browse the repository at this point in the history
…t a query analyzer (#1421)
  • Loading branch information
adelapena authored Dec 2, 2024
1 parent ba5b093 commit 262785b
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public ByteBuffer execute(ProtocolVersion protocolVersion, List<ByteBuffer> para

LuceneAnalyzer luceneAnalyzer = null;
List<String> tokens = new ArrayList<>();
try (Analyzer analyzer = JSONAnalyzerParser.parse(json))
try (Analyzer analyzer = JSONAnalyzerParser.parse(json).left)
{
luceneAnalyzer = new LuceneAnalyzer(UTF8Type.instance, analyzer, new HashMap<>());

Expand Down
21 changes: 18 additions & 3 deletions src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientWarn;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
Expand All @@ -115,6 +116,13 @@

public class StorageAttachedIndex implements Index
{
public static final String NGRAM_WITHOUT_QUERY_ANALYZER_WARNING =
"Using an ngram analyzer without defining a query_analyzer. " +
"This means that the same ngram analyzer will be applied to both indexed and queried column values. " +
"Applying ngram analysis to the queried values usually produces too many search tokens to be useful. " +
"The large number of tokens can also have a negative impact in performance. " +
"In most cases it's better to use a simpler query_analyzer such as the standard one.";

private static final Logger logger = LoggerFactory.getLogger(StorageAttachedIndex.class);
private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport();

Expand Down Expand Up @@ -320,9 +328,16 @@ public static Map<String, String> validateOptions(Map<String, String> options, T
}

AbstractType<?> type = TypeUtil.cellValueType(target.left, target.right);
AbstractAnalyzer.fromOptions(type, options); // will throw if invalid
if (AbstractAnalyzer.hasQueryAnalyzer(options))
AbstractAnalyzer.fromOptionsQueryAnalyzer(type, options); // will throw if invalid

// Validate analyzers by building them
try (AbstractAnalyzer.AnalyzerFactory analyzerFactory = AbstractAnalyzer.fromOptions(type, options))
{
if (AbstractAnalyzer.hasQueryAnalyzer(options))
AbstractAnalyzer.fromOptionsQueryAnalyzer(type, options).close();
else if (analyzerFactory.isNGram())
ClientWarn.instance.warn(NGRAM_WITHOUT_QUERY_ANALYZER_WARNING);
}

var config = IndexWriterConfig.fromOptions(null, type, options);

// If we are indexing map entries we need to validate the sub-types
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.sai.utils.TypeUtil;
import org.apache.cassandra.utils.Pair;
import org.apache.lucene.analysis.Analyzer;

public abstract class AbstractAnalyzer implements Iterator<ByteBuffer>
Expand Down Expand Up @@ -111,6 +112,15 @@ default boolean supportsEquals()
return true;
}

/**
* @return {@link true} if this analyzer configuration has a n-gram tokenizer or any of its filters is n-gram.
*/
default boolean isNGram()
{
return false;
}

@Override
default void close()
{
}
Expand All @@ -132,7 +142,10 @@ public static AnalyzerFactory toAnalyzerFactory(String json, final AbstractType<

try
{
final Analyzer analyzer = JSONAnalyzerParser.parse(json);
Pair<Analyzer, LuceneCustomAnalyzerConfig> analyzerAndConfig = JSONAnalyzerParser.parse(json);
final Analyzer analyzer = analyzerAndConfig.left;
final boolean isNGram = analyzerAndConfig.right != null && analyzerAndConfig.right.isNGram();

return new AnalyzerFactory()
{
@Override
Expand All @@ -141,11 +154,18 @@ public void close()
analyzer.close();
}

@Override
public AbstractAnalyzer create()
{
return new LuceneAnalyzer(type, analyzer, options);
}

@Override
public boolean isNGram()
{
return isNGram;
}

@Override
public boolean supportsEquals()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.sai.analyzer.filter.BuiltInAnalyzers;
import org.apache.cassandra.utils.Pair;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
Expand All @@ -36,12 +37,12 @@ public class JSONAnalyzerParser
{
private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

public static Analyzer parse(String json) throws IOException
public static Pair<Analyzer, LuceneCustomAnalyzerConfig> parse(String json) throws IOException
{
Analyzer analyzer = matchBuiltInAnalzyer(json.toUpperCase());
if (analyzer != null)
{
return analyzer;
return Pair.create(analyzer, null);
}

LuceneCustomAnalyzerConfig analyzerModel;
Expand Down Expand Up @@ -147,12 +148,15 @@ public static Analyzer parse(String json) throws IOException
throw new InvalidRequestException("Error configuring analyzer's charFilter '" + charFilter.getName() + "': " + e.getMessage());
}
}
return builder.build();
return Pair.create(builder.build(), analyzerModel);
}

private static Analyzer matchBuiltInAnalzyer(String maybeAnalyzer) {
for (BuiltInAnalyzers analyzer : BuiltInAnalyzers.values()) {
if (analyzer.name().equals(maybeAnalyzer)) {
private static Analyzer matchBuiltInAnalzyer(String maybeAnalyzer)
{
for (BuiltInAnalyzers analyzer : BuiltInAnalyzers.values())
{
if (analyzer.name().equals(maybeAnalyzer))
{
return analyzer.getNewAnalyzer();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,21 @@ public List<LuceneClassNameAndArgs> getCharFilters()
{
return charFilters;
}

/**
* @return {@link true} if this analyzer configuration has a n-gram tokenizer or any of its filters is n-gram.
*/
public boolean isNGram()
{
if (getTokenizer().getName().equals("ngram"))
return true;

for (LuceneClassNameAndArgs filter : getFilters())
{
if (filter.getName().equals("ngram"))
return true;
}

return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,14 @@ public void testMissingClassException() throws Exception

public static List<String> tokenize(String testString, String json) throws Exception
{
Analyzer luceneAnalyzer = JSONAnalyzerParser.parse(json);
LuceneAnalyzer analyzer = new LuceneAnalyzer(UTF8Type.instance, luceneAnalyzer, new HashMap<String, String>());
Analyzer luceneAnalyzer = JSONAnalyzerParser.parse(json).left;
LuceneAnalyzer analyzer = new LuceneAnalyzer(UTF8Type.instance, luceneAnalyzer, new HashMap<>());

ByteBuffer toAnalyze = ByteBuffer.wrap(testString.getBytes(Charsets.UTF_8));
analyzer.reset(toAnalyze);
ByteBuffer analyzed = null;

List<String> list = new ArrayList();
List<String> list = new ArrayList<>();

while (analyzer.hasNext())
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.sai.SAITester;
import org.apache.cassandra.index.sai.StorageAttachedIndex;
import org.apache.cassandra.index.sai.analyzer.AnalyzerEqOperatorSupport;
import org.apache.cassandra.index.sai.analyzer.filter.BuiltInAnalyzers;
import org.apache.cassandra.service.ClientWarn;
import org.assertj.core.api.Assertions;

import static org.assertj.core.api.Assertions.assertThatThrownBy;
Expand Down Expand Up @@ -824,4 +826,61 @@ public void testHighNumberOfMatchPredicates()
}
assertTrue("Query too slow: " + minElapsed + " ms", minElapsed < 1000);
}

@Test
public void testClientWarningOnNGram()
{
// no explicit analyzer
assertNoWarning("{}");
assertNoWarning("{'ascii': 'true', 'case_sensitive': 'false', 'normalize': 'true'}");

// standard analyzer
assertNoWarning("{'index_analyzer': 'standard'}");
assertNoWarning("{'index_analyzer': 'whitespace'}");

// custom non-ngram analyzer

// ngram analyzer without query_analyzer
assertClientWarningOnNGram("{'index_analyzer': '{" +
" \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}," +
" \"filters\":[{\"name\":\"lowercase\"}]}'}");
assertClientWarningOnNGram("{'index_analyzer': '{" +
" \"tokenizer\":{\"name\":\"whitespace\"}," +
" \"filters\":[{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}]}'}");

// ngram analyzer with ngram query_analyzer
assertNoWarning("{'index_analyzer': '{" +
" \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}}'," +
"'query_analyzer': '{" +
" \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}}'}");

// ngram analyzer with non-ngram query_analyzer
assertNoWarning("{'index_analyzer': '{" +
" \"tokenizer\":{\"name\":\"ngram\", \"args\":{\"minGramSize\":\"2\", \"maxGramSize\":\"3\"}}," +
" \"filters\":[{\"name\":\"lowercase\"}]}'," +
"'query_analyzer': '{" +
" \"tokenizer\":{\"name\":\"whitespace\"}," +
" \"filters\":[{\"name\":\"porterstem\"}]}'}");
}

private void assertClientWarningOnNGram(String indexOptions)
{
createIndexFromOptions(indexOptions);
Assertions.assertThat(ClientWarn.instance.getWarnings())
.hasSize(1)
.allMatch(w -> w.contains(StorageAttachedIndex.NGRAM_WITHOUT_QUERY_ANALYZER_WARNING));
}

private void assertNoWarning(String indexOptions)
{
createIndexFromOptions(indexOptions);
Assertions.assertThat(ClientWarn.instance.getWarnings()).isNull();
}

private void createIndexFromOptions(String options)
{
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
ClientWarn.instance.captureWarnings();
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex' WITH OPTIONS = " + options);
}
}

0 comments on commit 262785b

Please sign in to comment.