Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding float64 support, document level boosting, and facet collector #52

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions 4.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Collecting pytest
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file seems like it was added in error...?

Downloading pytest-7.1.2-py3-none-any.whl (297 kB)
Requirement already satisfied: colorama in c:\users\alphaceph\anaconda3\envs\py310\lib\site-packages (from pytest) (0.4.4)
Collecting attrs>=19.2.0
Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
Collecting py>=1.8.2
Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
Collecting atomicwrites>=1.0
Downloading atomicwrites-1.4.1.tar.gz (14 kB)
Collecting pluggy<2.0,>=0.12
Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)
Collecting packaging
Downloading packaging-21.3-py3-none-any.whl (40 kB)
Collecting iniconfig
Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Requirement already satisfied: tomli>=1.0.0 in c:\users\alphaceph\anaconda3\envs\py310\lib\site-packages (from pytest) (2.0.1)
Collecting pyparsing!=3.0.5,>=2.0.2
Downloading pyparsing-3.0.9-py3-none-any.whl (98 kB)
Building wheels for collected packages: atomicwrites
Building wheel for atomicwrites (setup.py): started
Building wheel for atomicwrites (setup.py): finished with status 'done'
Created wheel for atomicwrites: filename=atomicwrites-1.4.1-py2.py3-none-any.whl size=6957 sha256=a1a268c4dc96c217af8ea7655cc187e388dcca401b511a0a00e532af25c25aee
Stored in directory: c:\users\alphaceph\appdata\local\pip\cache\wheels\34\07\0b\33b15f68736109f72ea0bb2499521d87312b932620737447a2
Successfully built atomicwrites
Installing collected packages: pyparsing, py, pluggy, packaging, iniconfig, attrs, atomicwrites, pytest
Successfully installed atomicwrites-1.4.1 attrs-21.4.0 iniconfig-1.1.1 packaging-21.3 pluggy-1.0.0 py-1.11.0 pyparsing-3.0.9 pytest-7.1.2
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
[build-system]
requires = ["maturin"]
requires = ["maturin>=0.13,<0.14"]
build-backend = "maturin"

[project]
name = "tantivy"
requires-python = ">=3.7"

2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
maturin
maturin==0.13.0
pytest>=4.0
2 changes: 1 addition & 1 deletion src/facet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ impl Facet {
#[classmethod]
fn from_string(_cls: &PyType, facet_string: &str) -> Facet {
Facet {
inner: schema::Facet::from(facet_string),
inner: schema::Facet::from_text(facet_string).unwrap(),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to comment about the unwrap but I see that the LHS schema::Facet::from is just implemented using from_text with an unwrap anyway: https://docs.rs/tantivy/0.18.0/src/tantivy/schema/facet.rs.html#181-183

I'm curious what was the reason for this change?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't exactly remember this change. Maybe I couldn't get facets to work without it

}
}

Expand Down
20 changes: 18 additions & 2 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,12 @@ impl Index {
/// field is specified in the query.
///
#[args(reload_policy = "RELOAD_POLICY")]
#[args(conjunction_by_default = false)]
pub fn parse_query(
&self,
query: &str,
default_field_names: Option<Vec<String>>,
conjunction_by_default: bool,
) -> PyResult<Query> {
let mut default_fields = vec![];
let schema = self.index.schema();
Expand All @@ -344,12 +346,26 @@ impl Index {
} else {
for (field, field_entry) in self.index.schema().fields() {
if field_entry.is_indexed() {
default_fields.push(field);

match field_entry.field_type() {
tv::schema::FieldType::Facet(_) => {
// facets aren't suited for default fields
},
_ => {
default_fields.push(field);
},
}

}
}
}
let parser =
let mut parser =
tv::query::QueryParser::for_index(&self.index, default_fields);

if conjunction_by_default {
parser.set_conjunction_by_default();
}

let query = parser.parse_query(query).map_err(to_pyerr)?;

Ok(Query { inner: query })
Expand Down
82 changes: 80 additions & 2 deletions src/schemabuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use pyo3::{exceptions, prelude::*};

use tantivy::schema;
use tantivy::schema::{self, FacetOptions};

use crate::schema::Schema;
use std::sync::{Arc, RwLock};
Expand Down Expand Up @@ -131,6 +131,50 @@ impl SchemaBuilder {
Ok(self.clone())
}

/// Add a new 64-bit float field to the schema.
///
/// Note: values added to this field in the index must be floats. Adding
/// integers or other types may produce incorrect results.
///
/// Args:
///     name (str): The name of the field.
///     stored (bool, optional): If true sets the field as stored, the
///         content of the field can be later restored from a Searcher.
///         Defaults to False.
///     indexed (bool, optional): If true sets the field to be indexed.
///         Defaults to False.
///     fast (str, optional): Enables the fast field option for this field.
///         Can be one of 'single' or 'multi' (case-insensitive). If this is
///         set to 'single', the document must have exactly one value
///         associated to the field. If this is set to 'multi', the document
///         can have any number of values associated to the field. Defaults
///         to None, which disables this option.
///
/// Returns a clone of this schema builder.
/// Raises a ValueError if `fast` is not a recognised value or if the schema
/// builder is no longer valid.
#[args(stored = false, indexed = false)]
fn add_float_field(
    &mut self,
    name: &str,
    stored: bool,
    indexed: bool,
    fast: Option<&str>,
) -> PyResult<Self> {
    // Validate the options first so an invalid `fast` value is reported
    // before the builder lock is taken.
    let options = SchemaBuilder::build_float_option(stored, indexed, fast)?;

    match self.builder.write().unwrap().as_mut() {
        Some(inner) => {
            inner.add_f64_field(name, options);
        }
        None => {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
    }

    Ok(self.clone())
}

/// Add a new unsigned integer field to the schema.
///
/// Args:
Expand Down Expand Up @@ -267,11 +311,12 @@ impl SchemaBuilder {
/// Add a Facet field to the schema.
/// Args:
/// name (str): The name of the field.
#[args(stored = false, indexed = false)]
fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
let builder = &mut self.builder;

if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_facet_field(name, INDEXED);
builder.add_facet_field(name, FacetOptions::default());
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
Expand Down Expand Up @@ -352,6 +397,39 @@ impl SchemaBuilder {
Ok(opts)
}

/// Build the `NumericOptions` used when adding a float field.
///
/// Args:
///     stored: when true, the options are marked as stored.
///     indexed: when true, the options are marked as indexed.
///     fast: optional fast field mode, matched case-insensitively. "single"
///         maps to `Cardinality::SingleValue` and "multi" maps to
///         `Cardinality::MultiValues`. `None` leaves the fast option unset.
///
/// Returns the assembled options.
/// Raises a ValueError if `fast` is given but is neither "single" nor "multi".
fn build_float_option(
    stored: bool,
    indexed: bool,
    fast: Option<&str>,
) -> PyResult<schema::NumericOptions> {
    let mut opts = schema::NumericOptions::default();

    if stored {
        opts = opts.set_stored();
    }
    if indexed {
        opts = opts.set_indexed();
    }

    if let Some(mode) = fast {
        let cardinality = match mode.to_lowercase().as_str() {
            "single" => schema::Cardinality::SingleValue,
            "multi" => schema::Cardinality::MultiValues,
            _ => {
                // The message names the values this match actually accepts.
                return Err(exceptions::PyValueError::new_err(
                    "Invalid fast option, valid choices are: 'single' and 'multi'",
                ));
            }
        };
        opts = opts.set_fast(cardinality);
    }

    Ok(opts)
}

fn build_text_option(
stored: bool,
tokenizer_name: &str,
Expand Down
81 changes: 79 additions & 2 deletions src/searcher.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#![allow(clippy::new_ret_no_self)]

use std::collections::HashMap;
use crate::{document::Document, get_field, query::Query, to_pyerr};
use pyo3::{exceptions::PyValueError, prelude::*};
use tantivy as tv;
use tantivy::collector::{Count, MultiCollector, TopDocs};
use tv::collector::{FacetCollector};
use tv::fastfield::FastFieldReader;
use tv::{SegmentReader, Score, DocId};

/// Tantivy's Searcher class
///
Expand Down Expand Up @@ -41,10 +45,15 @@ impl ToPyObject for Fruit {
/// Object holding the results of a successful search.
pub(crate) struct SearchResult {
/// The matching documents; each entry pairs a `Fruit` with the address of
/// the document it belongs to.
hits: Vec<(Fruit, DocAddress)>,

#[pyo3(get)]
/// How many documents matched the query. Only available if `count` was set
/// to true during the search.
count: Option<usize>,

#[pyo3(get)]
/// Facet counts keyed by facet path. Only available if the
/// `count_facets_by_field` parameter was given during the search.
facet_counts: Option<HashMap<String, u64>>,
}

#[pymethods]
Expand Down Expand Up @@ -83,10 +92,15 @@ impl Searcher {
/// return. Defaults to 10.
/// count (bool, optional): Should the number of documents that match
/// the query be returned as well. Defaults to true.
/// count_facets_by_field (Field, optional): If given, also return the
/// number of matching documents grouped by facet for this facet field.
/// Defaults to None.
/// order_by_field (Field, optional): A schema field that the results
/// should be ordered by. The field must be declared as a fast field
/// when building the schema. Note, this only works for unsigned
/// fields.
/// weight_by_field (Field, optional): A schema field whose value is added
/// to the score of each document. The field must be declared as a fast
/// field of float type when building the schema.
/// offset (Field, optional): The offset from which the results have
/// to be returned.
///
Expand All @@ -100,7 +114,9 @@ impl Searcher {
query: &Query,
limit: usize,
count: bool,
count_facets_by_field: Option<&str>,
order_by_field: Option<&str>,
weight_by_field: Option<&str>,
offset: usize,
) -> PyResult<SearchResult> {
let mut multicollector = MultiCollector::new();
Expand All @@ -111,8 +127,50 @@ impl Searcher {
None
};

let facet_handle = if let Some(facet_name) = count_facets_by_field {
let field = get_field(&self.inner.index().schema(), facet_name)?;
let mut facet_collector = FacetCollector::for_field(field);
facet_collector.add_facet("/");
Some(multicollector.add_collector(facet_collector))
} else {
None
};

let (mut multifruit, hits) = {
if let Some(order_by) = order_by_field {

if let Some(weight_by) = weight_by_field {

let field = get_field(&self.inner.index().schema(), weight_by)?;
let collector = TopDocs::with_limit(limit)
.and_offset(offset)
.tweak_score(move |segment_reader: &SegmentReader| {
let weight_reader = segment_reader.fast_fields().f64(field).unwrap();
return move |doc: DocId, original_score: Score| {
let weight: f64 = weight_reader.get(doc);
let new_score = original_score + weight as f32;
return new_score
}
});

let top_docs_handle = multicollector.add_collector(collector);
let ret = self.inner.search(query.get(), &multicollector);

match ret {
Ok(mut r) => {
let top_docs = top_docs_handle.extract(&mut r);
let result: Vec<(Fruit, DocAddress)> = top_docs
.iter()
.map(|(f, d)| {
(Fruit::Score(*f), DocAddress::from(d))
})
.collect();
(r, result)
}
Err(e) => return Err(PyValueError::new_err(e.to_string())),
}

} else if let Some(order_by) = order_by_field {

let field = get_field(&self.inner.index().schema(), order_by)?;
let collector = TopDocs::with_limit(limit)
.and_offset(offset)
Expand Down Expand Up @@ -159,7 +217,26 @@ impl Searcher {
None => None,
};

Ok(SearchResult { hits, count })
let facet_counts:Option<HashMap<String, u64>> = match facet_handle {
Some(h) => {
let facet_counts_obj = h.extract(&mut multifruit);

let collection: Vec<(&tv::schema::Facet, u64)> = facet_counts_obj
.get("/")
.collect();

let mut facet_counts:HashMap<String, u64> = HashMap::new();

for (facet, count) in collection.iter() {
facet_counts.insert(facet.to_path_string(), *count);
}

Some(facet_counts)
},
None => None,
};

Ok(SearchResult { hits, count, facet_counts})
}

/// Returns the overall number of documents in the index.
Expand Down