Skip to content

Commit

Permalink
Add support and upgrade to lucille 0.0.3
Browse files (browse the repository at this point in the history)
  • Loading branch information
valencik committed Nov 29, 2024
1 parent 82ae0c3 commit 8bc196c
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 98 deletions.
2 changes: 1 addition & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ val catsEffectV = "3.5.5"
val catsV = "2.12.0"
val fs2V = "3.11.0"
val laikaV = "1.2.1"
val lucilleV = "0.0.2"
val lucilleV = "0.0.3"
val munitCatsEffectV = "2.0.0"
val munitV = "1.0.2"
val scalajsDomV = "2.8.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package pink.cozydev.protosearch

import cats.data.NonEmptyList
import pink.cozydev.lucille.{MultiQuery, Query, TermQuery}
import pink.cozydev.lucille.{Query, TermQuery}
import pink.cozydev.protosearch.internal.PositionalIter

import java.util.regex.PatternSyntaxException
Expand Down Expand Up @@ -50,11 +50,10 @@ object IndexSearcher {

def search(q: Query): Either[String, Set[Int]] =
q match {
case MultiQuery(qs) => qs.traverse(search).map(defaultCombine)
case Query.Or(qs) => qs.traverse(search).map(IndexSearcher.unionSets)
case Query.And(qs) => qs.traverse(search).map(IndexSearcher.intersectSets)
case Query.Not(q) => search(q).map(matches => allDocs -- matches)
case Query.Group(qs) => qs.traverse(search).map(defaultCombine)
case Query.Group(q) => search(q)
case Query.Field(f, q) =>
index.indexes
.get(f)
Expand Down Expand Up @@ -89,17 +88,17 @@ object IndexSearcher {
case Query.Or(qs) => qs.traverse(search).map(IndexSearcher.unionSets)
case Query.And(qs) => qs.traverse(search).map(IndexSearcher.intersectSets)
case Query.Not(q) => search(q).map(matches => allDocs -- matches)
case Query.Group(qs) => qs.traverse(search).map(defaultCombine)
case Query.Group(q) => search(q)
case Query.Field(fn, q) =>
Left(s"Nested field queries not supported. Cannot query field '$fn' with q: $q")
case q: MultiQuery => q.qs.traverse(search).map(defaultCombine)
case q: Query.UnaryMinus => Left(s"Unsupported UnaryMinus in BooleanRetrieval: $q")
case q: Query.UnaryPlus => Left(s"Unsupported UnaryPlus in BooleanRetrieval: $q")
case q: Query.Proximity => Left(s"Unsupported Proximity in BooleanRetrieval: $q")
case q: Query.Fuzzy => Left(s"Unsupported Fuzzy in BooleanRetrieval: $q")
case q: Query.TermRegex => regexSearch(q)
case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in BooleanRetrieval: $q")
case q: Query.Boost => Left(s"Unsupported Boost in BooleanRetrieval: $q")
case q: Query.WildCard => Left(s"Unsupported WildCard in BooleanRetrieval: $q")
}

private def phraseSearch(index: Index, q: Query.Phrase): Either[String, Set[Int]] =
Expand Down
5 changes: 2 additions & 3 deletions core/src/main/scala/pink/cozydev/protosearch/Scorer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import cats.syntax.all.*
import pink.cozydev.lucille.Query

import scala.collection.mutable.HashMap
import pink.cozydev.lucille.MultiQuery
import pink.cozydev.protosearch.internal.PositionalIter

import java.util.regex.PatternSyntaxException
Expand All @@ -43,20 +42,20 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) {
case Query.Or(qs) => accScore(idx, qs)
case Query.And(qs) => accScore(idx, qs)
case Query.Not(_) => Right(NonEmptyList.one(Map.empty[Int, Double]))
case Query.Group(qs) => accScore(idx, qs)
case Query.Group(qs) => accScore(idx, NonEmptyList.one(qs))
case Query.Field(fn, q) =>
index.indexes.get(fn) match {
case None => Left(s"Field not found")
case Some(newIndex) => accScore(newIndex, NonEmptyList.one(q))
}
case q: MultiQuery => accScore(idx, q.qs)
case Query.UnaryMinus(_) => Right(NonEmptyList.one(Map.empty[Int, Double]))
case Query.UnaryPlus(q) => accScore(idx, NonEmptyList.one(q))
case q: Query.Proximity => Left(s"Unsupported Proximity encountered in Scorer: $q")
case q: Query.Fuzzy => Left(s"Unsupported Fuzzy encountered in Scorer: $q")
case q: Query.TermRegex => regexScore(idx, docs, q)
case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in Scorer: $q")
case q: Query.Boost => Left(s"Unsupported Boost in Scorer: $q")
case q: Query.WildCard => Left(s"Unsupported WildCard in Scorer: $q")
}
accScore(defaultIdx, NonEmptyList.one(qs)).map(combineMaps)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,103 +16,39 @@

package pink.cozydev.protosearch.analysis

import cats.data.NonEmptyList
import pink.cozydev.lucille.Query
import pink.cozydev.lucille.{Query, TermQuery}
import pink.cozydev.lucille.Query._
import pink.cozydev.lucille.QueryParser

import pink.cozydev.lucille.MultiQuery

// TODO This is a hack, the Lucille parser tokenizes on white space only currently
// We perhaps want Lucille to use a tokenizer from textmogrify
// In the meantime, we rewrite the Query with our `Analyzer`
case class QueryAnalyzer(
defaultField: String,
analyzers: Map[String, Analyzer],
) {
private def analyzeTermQ(a: Analyzer, query: Query): Either[String, Query] =
query match {
case q: Query.Term =>
val terms = NonEmptyList.fromFoldable(a.tokenize(q.str))
// println(s"analyzeTerQ processing '$q' -> $terms")
terms match {
case None => Left(s"Error tokenizing Term during analyzeTermQ: $q")
case Some(ts) =>
ts match {
case NonEmptyList(head, Nil) => Right(Query.Term(head))
case termNel => Right(Query.Group(termNel.map(Query.Term.apply)))
}
}
case q: Query.Prefix => Right(q)
case q: Query.TermRange => Right(q)
case q: Query.Phrase => Right(q)
case q: Query.Or =>
q.qs.traverse(qq => analyzeTermQ(a, qq)).map(qs => Query.Or(qs))
case q: Query.And =>
q.qs.traverse(qq => analyzeTermQ(a, qq)).map(qs => Query.And(qs))
case q: Query.Not =>
analyzeTermQ(a, q.q).map(qs => Query.Not(qs))
case q: Query.Group =>
q.qs.traverse(qq => analyzeTermQ(a, qq)).map(qs => Query.Group(qs))
case q: Query.Field => Left(s"Oops, nested field query?: $q")
case q: MultiQuery =>
q.qs.traverse(qq => analyzeTermQ(a, qq)).map(qs => MultiQuery(qs))
case q: Query.UnaryMinus =>
analyzeTermQ(a, q.q).map(qs => Query.UnaryMinus(qs))
case q: Query.UnaryPlus =>
analyzeTermQ(a, q.q).map(qs => Query.UnaryPlus(qs))
case q: Query.Proximity => Right(q)
case q: Query.Fuzzy => Right(q)
case q: Query.TermRegex => Right(q)
case q: Query.MinimumMatch => Right(q)
case q: Query.Boost => Right(q)
}
// TODO Support using the right analyzer for the right field
private val defaultAnalyzer = analyzers(defaultField)

private def analyzeQ(query: Query): Either[String, Query] =
private def analyzeTermQ(query: TermQuery): Either[String, Query] =
query match {
case Query.Term(q) =>
val qs: List[String] = analyzers(defaultField).tokenize(q)
NonEmptyList.fromList(qs) match {
case None => Left(s"Query analysis error, no terms found after tokenizing $query")
case Some(qs) =>
if (qs.length == 1) Right(Query.Term(qs.head))
else
Left(
s"Query analysis error, Term tokenized into multiple terms, this should be supported, but isn't yet"
)
case Term(t) =>
defaultAnalyzer.tokenize(t) match {
case Nil => Left(s"Error tokenizing Term '$t' during query analysis")
case q1 :: Nil => Right(Term(q1))
case q1 :: q2 :: tail => Right(Or(Term(q1), Term(q2), tail.map(Term)))
}
case q: Query.Prefix => Right(q)
case q: Query.TermRange => Right(q)
case Query.Phrase(q) =>
val qs: List[String] = analyzers(defaultField).tokenize(q)
NonEmptyList.fromList(qs) match {
case None => Left(s"Query analysis error, no terms found after tokenizing $query")
case Phrase(p) =>
defaultAnalyzer.tokenize(p) match {
case Nil => Left(s"Error tokenizing Phrase '$p' during query analysis")
// TODO This is also a hack, we shouldn't reconstruct a string!
case Some(qs) => Right(Query.Phrase(qs.toList.mkString(" ")))
}
case q: Query.Or => q.qs.traverse(analyzeQ).map(Query.Or.apply)
case q: Query.And => q.qs.traverse(analyzeQ).map(Query.And.apply)
case q: Query.Not => analyzeQ(q.q).map(Query.Not.apply)
case q: Query.Group => q.qs.traverse(analyzeQ).map(Query.Group.apply)
case Query.Field(fn, q) =>
analyzers.get(fn) match {
case None => Left(s"Query analysis error, field $fn is not supported in query $query")
case Some(a) => analyzeTermQ(a, q).map(qq => Query.Field(fn, qq))
case terms => Right(Phrase(terms.mkString(" ")))
}
case q: MultiQuery => q.qs.traverse(analyzeQ).map(MultiQuery.apply)
case q: Query.UnaryMinus => analyzeQ(q.q).map(Query.UnaryMinus.apply)
case q: Query.UnaryPlus => analyzeQ(q.q).map(Query.UnaryPlus.apply)
case q: Query.Proximity => Right(q)
case q: Query.Fuzzy => Right(q)
case q: Query.TermRegex => Right(q)
case q: Query.MinimumMatch => Right(q)
case q: Query.Boost => Right(q)
case q => Right(q)
}

def parse(queryString: String): Either[String, MultiQuery] = {
val q: Either[String, MultiQuery] =
QueryParser.parse(queryString)
q.flatMap(mq => mq.qs.traverse(analyzeQ).map(qs => MultiQuery(qs)))
}
def parse(queryString: String): Either[String, Query] =
QueryParser.parse(queryString).flatMap(q => q.traverseQ(analyzeTermQ))

}
object QueryAnalyzer {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,32 @@

package pink.cozydev.protosearch

import cats.data.NonEmptyList
import pink.cozydev.lucille.Query
import pink.cozydev.protosearch.LastTermRewrite._
import pink.cozydev.lucille.MultiQuery

class LastTermRewriteSuite extends munit.FunSuite {
import pink.cozydev.lucille.Query._

test("termToPrefix rewrites termQ to termQ and prefix") {
val q = Term("f")
val expected =
Group(NonEmptyList.one(Or(NonEmptyList.of(Term("f"), Prefix("f")))))
Group(Or(Term("f"), Prefix("f")))
assertEquals(termToPrefix(q), expected)
}

test("termToPrefix rewrites fieldQ to fieldQ's with termQ and prefix") {
val q: Query = Field("fn", Term("c"))
val expected =
Field("fn", Group(NonEmptyList.one(Or(NonEmptyList.of(Term("c"), Prefix("c"))))))
Field("fn", Group(Or(Term("c"), Prefix("c"))))
assertEquals(q.mapLastTerm(termToPrefix), expected)
}

test("termToPrefix rewrites only the last termQ to termQ and prefix") {
val q =
MultiQuery(Term("first"), Term("f"))
val q = Or(Term("first"), Term("f"))
val expected =
MultiQuery(
Or(
Term("first"),
Group(NonEmptyList.one(Or(NonEmptyList.of(Term("f"), Prefix("f"))))),
Group(Or(Term("f"), Prefix("f"))),
)
assertEquals(q.mapLastTerm(termToPrefix), expected)
}
Expand Down

0 comments on commit 8bc196c

Please sign in to comment.