Skip to content

Commit

Permalink
Merge branch 'search-algorithm' into api
Browse files Browse the repository at this point in the history
  • Loading branch information
memo33 committed Nov 16, 2024
2 parents da43299 + 81d0848 commit ea438f4
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

### Changed
- The API was upgraded to version 2.0.
- The fuzzy search algorithm was changed to improve results for partial matches.


## [0.4.5] - 2024-10-17
Expand Down
2 changes: 1 addition & 1 deletion shared/shared/src/main/scala/sc4pac/shared.scala
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ abstract class SharedData {
) derives ReadWriter {
def isSc4pacAsset: Boolean = group == JsonRepoUtil.sc4pacAssetOrg.value
def toBareDep: BareDep = if (isSc4pacAsset) BareAsset(ModuleName(name)) else BareModule(Organization(group), ModuleName(name))
private[sc4pac] def toSearchString: String = s"$group:$name $summary"
private[sc4pac] def toSearchString: String = s"$group:$name $summary".toLowerCase(java.util.Locale.ENGLISH)
}

case class Channel(
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/sc4pac/Constants.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ object Constants {
val maxRedirectionsOpt = Some(20)
val sslRetryCount = 3 // Coursier legacy
val resumeIncompleteDownloadAttemps = 4
val fuzzySearchThreshold = 50 // 0..100
val fuzzySearchThreshold = 80 // 0..100
val cacheTtl = 12.hours
val channelContentsTtl = 30.minutes
val channelContentsTtlShort = 60.seconds
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/sc4pac/Data.scala
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ object JsonData extends SharedData {
// def toDependency = DepVariant.fromDependency(C.Dependency(moduleWithAttributes, version)) // TODO remove?
def toDepModule = DepModule(Organization(group), ModuleName(name), version = version, variant = variant)
def toBareModule = BareModule(Organization(group), ModuleName(name))
private[sc4pac] def toSearchString: String = s"$group:$name $summary" // copied from ChannelItem.toSearchString
private[sc4pac] def toSearchString: String = s"$group:$name $summary".toLowerCase(java.util.Locale.ENGLISH) // copied from ChannelItem.toSearchString
def toApiInstalled = api.InstalledStatus.Installed(version = version, variant = variant, installedAt = installedAt, updatedAt = updatedAt)
}

Expand Down
34 changes: 32 additions & 2 deletions src/main/scala/sc4pac/Sc4pac.scala
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class Sc4pac(val context: ResolutionContext, val tempRoot: os.Path) extends Upda
* Api.searchPlugins implements a similar function and should use the same algorithm.
*/
def search(query: String, threshold: Int, category: Option[String]): Task[Seq[(BareModule, Int, Option[String])]] = iterateAllChannelContents.map { itemsIter =>
val searchTokens = Sc4pac.fuzzySearchTokenize(query)
val results: Seq[(BareModule, Int, Option[String])] =
itemsIter.flatMap { item =>
if (item.isSc4pacAsset) {
Expand All @@ -85,8 +86,8 @@ class Sc4pac(val context: ResolutionContext, val tempRoot: os.Path) extends Upda
} else {
// TODO reconsider choice of search algorithm
val ratio =
if (query.isEmpty && category.isDefined) 100 // return the entire category
else me.xdrop.fuzzywuzzy.FuzzySearch.tokenSetRatio(query, item.toSearchString)
if (searchTokens.isEmpty && category.isDefined) 100 // return the entire category
else Sc4pac.fuzzySearchRatio(searchTokens, item.toSearchString, threshold)
if (ratio >= threshold) {
Some(BareModule(Organization(item.group), ModuleName(item.name)), ratio, Option(item.summary).filter(_.nonEmpty))
} else None
Expand Down Expand Up @@ -593,4 +594,33 @@ object Sc4pac {
}
}

/** Splits a search query into lower-cased tokens for the fuzzy search.
  *
  * The query is lower-cased using the ENGLISH locale (the same normalization
  * that `toSearchString` applies to the searched text) and split at spaces.
  *
  * Empty tokens are dropped. `split(' ')` yields `Array("")` for an empty
  * query and keeps empty strings for leading or repeated spaces. Without
  * filtering, an empty query would never produce an empty token sequence
  * (which callers test via `isEmpty`), and every empty token would count as a
  * perfect match because any text contains the empty string.
  *
  * @param searchString the raw query as entered by the user
  * @return the non-empty, lower-cased tokens in their original order;
  *         empty if the query is empty or consists only of spaces
  */
private[sc4pac] def fuzzySearchTokenize(searchString: String): IndexedSeq[String] = {
  searchString
    .toLowerCase(java.util.Locale.ENGLISH)
    .split(' ')
    .iterator
    .filter(_.nonEmpty)  // discard artifacts of empty input and leading/consecutive spaces
    .toIndexedSeq
}

/** Scores how well a tokenized query matches `text`, on a scale of 0..100.
  *
  * This is a workaround for shortcomings of the fuzzywuzzy algorithms
  * `tokenSetRatio` (does not match partial strings) and
  * `tokenSetPartialRatio` (yields many unsuitable matches, e.g. for
  * "vip terrain mod").
  *
  * Each token is scored individually. Scores below `threshold` are counted
  * as 0, and the sum is averaged over all tokens.
  *
  * @param searchTokens the query tokens
  * @param text the text to search in
  * @param threshold minimum score (0..100) a single token needs in order to
  *                  contribute to the result
  * @return the rounded average token score, or 0 if there are no tokens
  */
private[sc4pac] def fuzzySearchRatio(searchTokens: IndexedSeq[String], text: String, threshold: Int): Int =
  if (searchTokens.isEmpty) 0
  else {
    val total = searchTokens.foldLeft(0) { (sum, token) =>
      // The fuzzywuzzy library has a bug due to which
      //   partialRatio("wolf", "ulisse wolf hybrid-railway-subway-converter tunnel portals for hybrid railway (hrw)")
      // returns 50 rather than 100, see https://github.com/xdrop/fuzzywuzzy/issues/106
      // so plain containment is checked first as a mitigation.
      val score =
        if (text.contains(token)) 100
        else me.xdrop.fuzzywuzzy.FuzzySearch.partialRatio(token, text)
      // Discarding poor per-token matches here and thresholding the averaged
      // result again at the call site is a double truncation, so results can
      // be inconsistent across varying thresholds.
      if (score >= threshold) sum + score else sum
    }
    math.round(total.toFloat / searchTokens.length)
  }

}
5 changes: 3 additions & 2 deletions src/main/scala/sc4pac/api/api.scala
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ class Api(options: sc4pac.cli.Commands.ServerOptions) {
*/
def searchPlugins(query: String, threshold: Int, category: Option[String], items: Seq[JD.InstalledData]): (JD.Channel.Stats, Seq[(JD.InstalledData, Int)]) = {
val categoryStats = collection.mutable.Map.empty[String, Int]
val searchTokens = Sc4pac.fuzzySearchTokenize(query)
val results: Seq[(JD.InstalledData, Int)] =
items.flatMap { item =>
// TODO reconsider choice of search algorithm
val ratio =
if (query.isEmpty) 100 // return the entire category (or everything if there is no filter category)
else me.xdrop.fuzzywuzzy.FuzzySearch.tokenSetRatio(query, item.toSearchString)
if (searchTokens.isEmpty) 100 // return the entire category (or everything if there is no filter category)
else Sc4pac.fuzzySearchRatio(searchTokens, item.toSearchString, threshold)
if (ratio < threshold) {
None
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/sc4pac/cli.scala
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ object Commands {
final case class SearchOptions(
@ValueDescription("number") @Group("Search") @Tag("Search")
@HelpMessage(s"Fuziness (0..100, default=${Constants.fuzzySearchThreshold}): Smaller numbers lead to more results.")
threshold: Int = Constants.fuzzySearchThreshold // 0..100, default 50
threshold: Int = Constants.fuzzySearchThreshold // 0..100, default 80
) extends Sc4pacCommandOptions

case object Search extends Command[SearchOptions] {
Expand Down

0 comments on commit ea438f4

Please sign in to comment.