Promote 0.9.28 #1889

Merged: 45 commits, Nov 13, 2024
Changes from 1 commit (of 45)
42c92c3
misc(core): Adding unit tests for histograms for StitchRvsExec (#1831)
sandeep6189 Aug 8, 2024
059bfb9
fix bugs caused by stitching empty and non-empty data. (#1832)
yu-shipit Aug 12, 2024
aed6961
hotfix(0.9.27.1): revert "Now metadata queries support _type_ filter …
alextheimer Aug 21, 2024
74107b0
fix(query): fix parse issue with escaped dot character (#1841)
sherali42 Aug 26, 2024
5ae7cbe
feat(core): Do not index any existing _type_ field since it is a rese…
vishramachandran Sep 5, 2024
46236ea
hotfix(0.9.27.1): revert "Now metadata queries support _type_ filter …
alextheimer Sep 11, 2024
8108083
hotfix(0.9.27.1): revert "Now metadata queries support _type_ filter …
alextheimer Aug 21, 2024
7cc9cca
Unrevert "Now metadata queries support _type_ filter (#1819)"
alextheimer Sep 11, 2024
a7312de
hotfix(0.9.27.1): un/revert "Now metadata queries support _type_ filt…
alextheimer Sep 12, 2024
4320860
fix(query): negative rate/increase due to NaN chunk (#1846)
sherali42 Sep 13, 2024
c2946bd
fix(query): Generalizing the column filter check to span multiple te…
sandeep6189 Sep 16, 2024
26ab573
feat(core): Add support for Tantivy based time series index (#1852)
rfairfax Sep 20, 2024
101f566
feat(query): Add support for LogicalPlan updates, to use higher level…
sandeep6189 Sep 20, 2024
29dcec6
fix(core): Support Rust 1.78 (#1854)
rfairfax Sep 24, 2024
f9cbbcf
adding metrics for failover (#1856)
kvpetrov Sep 26, 2024
6fe42cc
fix(core): Fix Rust publishing of x86_64 binaries (#1857)
rfairfax Sep 26, 2024
e310445
fix(build): Support glibc qualifiers for Rust targets (#1858)
rfairfax Sep 27, 2024
aab8633
metrics for failover (#1859)
kvpetrov Sep 27, 2024
45f8d22
adding metrics for failover (#1856)
kvpetrov Sep 26, 2024
a685755
metrics for failover (#1859)
kvpetrov Sep 27, 2024
95ee5d2
Merge pull request #1860 from kvpetrov/shard_failover_metric
kvpetrov Sep 27, 2024
33e4656
misc(sparkjobs): force push metrics publish from index job (#1862)
sherali42 Oct 2, 2024
ebee7ae
fix(core): Fix tantivy column cache not releasing memory from deleted…
rfairfax Oct 7, 2024
22d0b08
fix(query) Fixed mismatched schema regarding fixedVectorLen. (#1855)
yu-shipit Oct 7, 2024
ccc70dd
fix(query) Fixed mismatched schema regarding fixedVectorLen. (#1855) …
yu-shipit Oct 7, 2024
6d0e997
misc(query): increment counter when query plan updated with next leve…
sandeep6189 Oct 8, 2024
80a37ac
fix(core): Improve performance for Tantivy indexValues call (#1867)
rfairfax Oct 11, 2024
69df0c2
feat(query): Support multiple aggregation rules for HierarchicalQuery…
sandeep6189 Oct 15, 2024
81cde60
feat(core): Now metadata queries support _type_ filter (#1819)
vishramachandran Jul 29, 2024
f7182b9
feat(core): Do not index any existing _type_ field since it is a rese…
vishramachandran Sep 5, 2024
4450bcd
Cherry-pick: support for _type_ filter in metadata queries
sherali42 Oct 15, 2024
84f7ade
fix(core): Don't index part keys with invalid schema (#1870)
rfairfax Oct 16, 2024
fd59ebb
fix(query) the schema provided by _type_ does not match colIDs in the…
yu-shipit Oct 21, 2024
6657340
fix(query): removing max/min aggregations from hierarchical query exp…
sandeep6189 Oct 24, 2024
880d6e9
Merge branch 'develop' into integ-merge
amolnayak311 Nov 1, 2024
7ed9466
Merge pull request #1876 from amolnayak311/integ-merge
amolnayak311 Nov 1, 2024
376e7c6
fix(coordinator): update LogicalPlanParser to correctly handle scalar…
Tanner-Meng Nov 1, 2024
7f008e7
perf(query) Memoize the part of the logical plan tree traversal for r…
amolnayak311 Nov 1, 2024
722caa4
Merge branch 'develop' into integ-merge-take2
amolnayak311 Nov 1, 2024
b5b3c0a
Merge pull request #1877 from amolnayak311/integ-merge-take2
amolnayak311 Nov 4, 2024
74c238f
Version bump to 0.9.28 (#1878)
amolnayak311 Nov 4, 2024
a606dbc
perf(query) Eliminate the allocation of memory for RepeatValueVector …
amolnayak311 Nov 8, 2024
9181cd9
Merge pull request #1888 from amolnayak311/integration
amolnayak311 Nov 13, 2024
5003655
Merge branch 'integration' into promote-0.9.28
amolnayak311 Nov 13, 2024
2c5f291
Version bump to 0.9.28.0
amolnayak311 Nov 13, 2024
feat(core): Add support for Tantivy based time series index (#1852)
New behavior:

This change adds support for the Tantivy indexing library as an alternative to Lucene for time series indexing. In several cases it has been found to outperform Lucene, especially in memory usage and in the predictability of memory spikes.

This feature is opt-in via a configuration setting to avoid any unexpected changes during upgrade. For the moment only the raw time series index is supported. Downsample support may come in a future PR.

BREAKING CHANGES

Because the Tantivy code is written in Rust, building now requires a working Rust toolchain and a C compiler. The README docs have been updated to reflect this.

There are no runtime breaking changes.
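Since the feature is opt-in, enabling it is a config change. A minimal override sketch in FiloDB's HOCON format, using the keys and default values this PR adds to `filodb-defaults.conf` (only `part-key-index-type` needs to change; the `tantivy` block is shown with its defaults for reference):

```hocon
filodb {
  memstore {
    # Opt in to the Tantivy-backed part key index (default is "lucene")
    part-key-index-type = tantivy

    tantivy {
      # Defaults added by this PR; override only if needed
      column-cache-count = 1000
      query-cache-max-bytes = 50MB
      query-cache-estimated-item-size = 31250
      deleted-doc-merge-threshold = 0.1
    }
  }
}
```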
rfairfax authored Sep 20, 2024
commit 26ab57373d17ce66a49c9dbcad5066c8e418a9bb
10 changes: 9 additions & 1 deletion .github/workflows/scala.yml
@@ -4,7 +4,7 @@ on:
push:
branches: [ develop ]
pull_request:
branches: [ develop, integration, main ]
branches: [ develop, integration, main, feat-index-rust ]

jobs:
test:
@@ -19,6 +19,14 @@ jobs:
with:
java-version: '11'
distribution: 'adopt'
- name: Install Rust Toolchain
uses: actions-rust-lang/setup-rust-toolchain@v1
with:
components: rustfmt, clippy
target: x86_64-apple-darwin, aarch64-apple-darwin, aarch64-unknown-linux-gnu
cache-workspaces: "core/src/rust -> target"
- name: Install cargo-zigbuild
run: pip install cargo-zigbuild
- name: Run tests
run: .github/workflows/runtests.sh
- name: Coverage Reports
3 changes: 3 additions & 0 deletions .gitignore
@@ -32,4 +32,7 @@ metastore_db/
**/kafka/src/test/scala/filodb/kafka/shard*
*lib*

# Allow Rust's lib.rs since we're otherwise blocking *lib* above
!lib.rs

coordinator/src/test/resources/
2 changes: 2 additions & 0 deletions README.md
@@ -95,6 +95,8 @@ To compile the .mermaid source files to .png's, install the [Mermaid CLI](http:/
3. [Apache Cassandra](http://cassandra.apache.org/) 2.x or 3.x (We prefer using [CCM](https://github.com/pcmanus/ccm) for local testing)
- For testing, install a single node C* cluster, like this: `ccm create v39_single -v 3.9 -n 1 -s`
4. [Apache Kafka](http://kafka.apache.org/) 0.10.x or above
5. [Rust](https://www.rust-lang.org/tools/install) to build native components
6. A working C compiler for your system (GCC or Clang)

Optional:

26 changes: 26 additions & 0 deletions core/src/main/resources/filodb-defaults.conf
@@ -789,6 +789,27 @@ filodb {
block-memory-manager-percent = 71
}

# Settings for Tantivy backed indexes
tantivy {
# Max number of items to keep in the column cache. This speeds up retrieval of values,
# especially on fast queries. Each cached item is very small, < 1KB
column-cache-count = 1000

# Max size of the query results cache, in bytes. This can have a major performance win
# for alert type queries that are periodically running the same query over and over.
query-cache-max-bytes = 50MB

# Estimated size of an item in the query cache, in bytes. This is the size in bits
# of the number of documents each segment searches over, estimated to 250k docs.
# This is a hint to the cache only and does not bound the max number of items.
query-cache-estimated-item-size = 31250

# Percentage of deleted docs in a segment that will flag this to be considered
# for a merge. Setting this too high will leave too many documents around
# and increase query time.
deleted-doc-merge-threshold = 0.1
}

# At the cost of some extra heap memory, we can track queries holding shared lock for a long time
# and starving the exclusive access of lock for eviction
track-queries-holding-eviction-lock = true
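The `query-cache-estimated-item-size` default appears to follow directly from the comment above it: each cached entry is roughly a per-segment document bitmap, one bit per document, so the 250k-document estimate works out to 31250 bytes. A quick sanity check of that arithmetic:

```rust
fn main() {
    // Estimate from the config comment: each segment searches ~250k docs
    let estimated_docs_per_segment: u64 = 250_000;
    // One bit per document in the cached result, converted to bytes
    let estimated_item_bytes = estimated_docs_per_segment / 8;
    assert_eq!(estimated_item_bytes, 31_250); // matches the config default
}
```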
@@ -809,6 +830,11 @@ filodb {

# Whether to add the _type_ label to all time series for the purpose of filtering
type-field-indexing-enabled = false

# The Part Key index implementation to use. Supported values:
# lucene - Lucene based index (default)
# tantivy - Tantivy based index
part-key-index-type = lucene
}

# for standalone worker cluster configuration, see akka-bootstrapper
32 changes: 32 additions & 0 deletions core/src/main/scala/filodb.core/Utils.scala
@@ -1,9 +1,11 @@
package filodb.core

import java.io.{File, IOException}
import java.lang.management.ManagementFactory

import com.typesafe.config.{Config, ConfigRenderOptions}
import com.typesafe.scalalogging.StrictLogging
import scala.util.{Failure, Try}

object Utils extends StrictLogging {
private val threadMbean = ManagementFactory.getThreadMXBean
@@ -37,4 +39,34 @@ object Utils extends StrictLogging {
logger.info(s"Available memory calculated or configured as $availableMem")
availableMem
}

// Recursively delete a folder
def deleteRecursively(f: File, deleteRoot: Boolean = false): Try[Boolean] = {
val subDirDeletion: Try[Boolean] =
if (f.isDirectory)
f.listFiles match {
case xs: Array[File] if xs != null && !xs.isEmpty =>
val subDirDeletions: Array[Try[Boolean]] = xs map (f => deleteRecursively(f, true))
subDirDeletions reduce ((reduced, thisOne) => {
thisOne match {
// Ensures that even if one Failure(_) is present, the result will be a Failure(Throwable)
case scala.util.Success(_) if reduced == scala.util.Success(true) => thisOne
case Failure(_) => thisOne
case _ => reduced
}
})
case _ => scala.util.Success(true)
}
else
scala.util.Success(true)

subDirDeletion match {
case scala.util.Success(_) =>
if (deleteRoot) {
if (f.delete()) scala.util.Success(true) else Failure(new IOException(s"Unable to delete $f"))
} else scala.util.Success(true)
case right@Failure(_) => right
}

}
}
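The reduce in `deleteRecursively` combines child results so that any `Failure` makes the aggregate a `Failure` (the latest one wins), and otherwise the aggregate stays `Success`. The same fold, sketched standalone in Rust over plain `Result`s (names are illustrative, not FiloDB API):

```rust
// Combine child deletion results the way deleteRecursively's reduce does:
// a failure in the current element always wins, then an earlier failure,
// and only an all-success input stays Ok.
fn combine(results: Vec<Result<bool, String>>) -> Result<bool, String> {
    results.into_iter().fold(Ok(true), |reduced, this_one| {
        match (reduced, this_one) {
            // A failure in the current element always wins
            (_, Err(e)) => Err(e),
            // Otherwise keep an earlier failure
            (Err(e), Ok(_)) => Err(e),
            (Ok(_), Ok(b)) => Ok(b),
        }
    })
}

fn main() {
    assert_eq!(combine(vec![Ok(true), Ok(true)]), Ok(true));
    assert_eq!(
        combine(vec![Ok(true), Err("cannot delete a".into()), Ok(true)]),
        Err("cannot delete a".into())
    );
    // Empty input: vacuously successful, like the `case _ => Success(true)` arm
    assert_eq!(combine(vec![]), Ok(true));
}
```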
@@ -263,7 +263,7 @@ TimeSeriesShard(ref, schemas, storeConfig, numShards, quotaSource, shardNum, buf
logger.debug(s"Creating TSPartition for ODP from part ID $id in dataset=$ref shard=$shardNum")
// If not there, then look up in Lucene and get the details
for { partKeyBytesRef <- partKeyIndex.partKeyFromPartId(id)
unsafeKeyOffset = PartKeyLuceneIndex.bytesRefToUnsafeOffset(partKeyBytesRef.offset)
unsafeKeyOffset = PartKeyIndexRaw.bytesRefToUnsafeOffset(partKeyBytesRef.offset)
group = partKeyGroup(schemas.part.binSchema, partKeyBytesRef.bytes, unsafeKeyOffset, numGroups)
sch <- Option(schemas(RecordSchema.schemaID(partKeyBytesRef.bytes, unsafeKeyOffset)))
} yield {
503 changes: 493 additions & 10 deletions core/src/main/scala/filodb.core/memstore/PartKeyIndex.scala

Large diffs are not rendered by default.

520 changes: 143 additions & 377 deletions core/src/main/scala/filodb.core/memstore/PartKeyLuceneIndex.scala

Large diffs are not rendered by default.

707 changes: 707 additions & 0 deletions core/src/main/scala/filodb.core/memstore/PartKeyTantivyIndex.scala

Large diffs are not rendered by default.

28 changes: 21 additions & 7 deletions core/src/main/scala/filodb.core/memstore/TimeSeriesShard.scala
@@ -285,8 +285,13 @@ class TimeSeriesShard(val ref: DatasetRef,
private val indexFacetingEnabledAllLabels = filodbConfig.getBoolean("memstore.index-faceting-enabled-for-all-labels")
private val numParallelFlushes = filodbConfig.getInt("memstore.flush-task-parallelism")
private val disableIndexCaching = filodbConfig.getBoolean("memstore.disable-index-caching")
private val partKeyIndexType = filodbConfig.getString("memstore.part-key-index-type")
private val typeFieldIndexingEnabled = filodbConfig.getBoolean("memstore.type-field-indexing-enabled")

private val tantivyColumnCacheCount = filodbConfig.getLong("memstore.tantivy.column-cache-count")
private val tantivyQueryCacheSize = filodbConfig.getMemorySize("memstore.tantivy.query-cache-max-bytes")
private val tantivyQueryCacheEstimatedItemSize =
filodbConfig.getMemorySize("memstore.tantivy.query-cache-estimated-item-size")
private val tantivyDeletedDocMergeThreshold = filodbConfig.getDouble("memstore.tantivy.deleted-doc-merge-threshold")

/////// END CONFIGURATION FIELDS ///////////////////

@@ -312,10 +317,19 @@ class TimeSeriesShard(val ref: DatasetRef,
* Used to answer queries not involving the full partition key.
* Maintained using a high-performance bitmap index.
*/
private[memstore] final val partKeyIndex: PartKeyIndexRaw = new PartKeyLuceneIndex(ref, schemas.part,
indexFacetingEnabledAllLabels, indexFacetingEnabledShardKeyLabels, shardNum,
storeConfig.diskTTLSeconds * 1000, disableIndexCaching = disableIndexCaching,
addMetricTypeField = typeFieldIndexingEnabled)
private[memstore] final val partKeyIndex: PartKeyIndexRaw = partKeyIndexType match {
case "lucene" => new PartKeyLuceneIndex(ref, schemas.part,
indexFacetingEnabledAllLabels, indexFacetingEnabledShardKeyLabels, shardNum,
storeConfig.diskTTLSeconds * 1000, disableIndexCaching = disableIndexCaching,
addMetricTypeField = typeFieldIndexingEnabled)
case "tantivy" => new PartKeyTantivyIndex(ref, schemas.part,
shardNum, storeConfig.diskTTLSeconds * 1000, columnCacheCount = tantivyColumnCacheCount,
queryCacheMaxSize = tantivyQueryCacheSize.toBytes,
queryCacheEstimatedItemSize = tantivyQueryCacheEstimatedItemSize.toBytes,
deletedDocMergeThreshold = tantivyDeletedDocMergeThreshold.toFloat,
addMetricTypeField = typeFieldIndexingEnabled)
case x => sys.error(s"Unsupported part key index type: '$x'")
}

private val cardTracker: CardinalityTracker = initCardTracker()

@@ -1222,7 +1236,7 @@ class TimeSeriesShard(val ref: DatasetRef,
}
partIter.skippedPartIDs.foreach { pId =>
partKeyIndex.partKeyFromPartId(pId).foreach { pk =>
val unsafePkOffset = PartKeyLuceneIndex.bytesRefToUnsafeOffset(pk.offset)
val unsafePkOffset = PartKeyIndexRaw.bytesRefToUnsafeOffset(pk.offset)
val schema = schemas(RecordSchema.schemaID(pk.bytes, unsafePkOffset))
val shardKey = schema.partKeySchema.colValues(pk.bytes, unsafePkOffset,
schemas.part.options.shardKeyColumns)
@@ -1907,7 +1921,7 @@ class TimeSeriesShard(val ref: DatasetRef,
partitions.get(partID) match {
case TimeSeriesShard.OutOfMemPartition =>
partKeyIndex.partKeyFromPartId(partID).map { pkBytesRef =>
val unsafeKeyOffset = PartKeyLuceneIndex.bytesRefToUnsafeOffset(pkBytesRef.offset)
val unsafeKeyOffset = PartKeyIndexRaw.bytesRefToUnsafeOffset(pkBytesRef.offset)
RecordSchema.schemaID(pkBytesRef.bytes, unsafeKeyOffset)
}.getOrElse(-1)
case p: TimeSeriesPartition => p.schema.schemaHash
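The `partKeyIndexType` match above is a simple string-keyed factory with a hard failure (`sys.error`) on unrecognized values. That selection shape, sketched free of FiloDB types (the enum and function names here are illustrative):

```rust
// Stand-ins for the two implementations behind PartKeyIndexRaw.
#[derive(Debug, PartialEq)]
enum PartKeyIndex {
    Lucene,
    Tantivy,
}

// Mirrors the Scala match: known names map to an implementation,
// anything else is a configuration error surfaced immediately.
fn build_index(part_key_index_type: &str) -> Result<PartKeyIndex, String> {
    match part_key_index_type {
        "lucene" => Ok(PartKeyIndex::Lucene),
        "tantivy" => Ok(PartKeyIndex::Tantivy),
        x => Err(format!("Unsupported part key index type: '{x}'")),
    }
}

fn main() {
    assert_eq!(build_index("lucene").unwrap(), PartKeyIndex::Lucene);
    assert_eq!(build_index("tantivy").unwrap(), PartKeyIndex::Tantivy);
    assert!(build_index("rocksdb").is_err());
}
```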
1,484 changes: 1,484 additions & 0 deletions core/src/rust/Cargo.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions core/src/rust/Cargo.toml
@@ -0,0 +1,9 @@
[workspace]
resolver = "2"
members = ["filodb_core", "tantivy_utils"]

# Keep debug symbols in the final binary
# This makes the binary slightly larger (~20MB), but makes profiling much more useful
# and has no runtime impact
[profile.release]
debug = true
2 changes: 2 additions & 0 deletions core/src/rust/clippy.toml
@@ -0,0 +1,2 @@
allow-unwrap-in-tests = true
allow-expect-in-tests = true
30 changes: 30 additions & 0 deletions core/src/rust/filodb_core/Cargo.toml
@@ -0,0 +1,30 @@
[package]
name = "filodb_core"
version = "0.1.0"
edition = "2021"

[lib]
crate-type = ["cdylib"]

[dependencies]
dhat = "0.3.3"
filesize = "0.2.0"
hashbrown = "0.14.5"
jni = "0.21.1"
nohash-hasher = "0.2.0"
nom = "7.1.3"
num-derive = "0.4.2"
num-traits = "0.2.19"
quick_cache = { version = "0.6.2", features = ["stats"] }
regex = "1.10.5"
tantivy = "0.22.0"
tantivy-common = "0.7.0"
tantivy-fst = "0.5.0"
tantivy_utils = { path = "../tantivy_utils" }
thiserror = "1.0.62"

[dev-dependencies]
bytes = "1.6.1"

[features]
dhat-heap = []
44 changes: 44 additions & 0 deletions core/src/rust/filodb_core/src/errors.rs
@@ -0,0 +1,44 @@
//! Error types to translate to exceptions for the JVM
use std::borrow::Cow;

use jni::JNIEnv;

const RUNTIME_EXCEPTION_CLASS: &str = "java/lang/RuntimeException";

/// Result type for java exception methods
pub type JavaResult<T> = Result<T, JavaException>;

/// Error type that can be thrown as an exception
#[derive(Debug)]
pub struct JavaException {
class: &'static str,
message: Cow<'static, str>,
}

impl JavaException {
/// Create a new java.lang.RuntimeException
pub fn new_runtime_exception(message: impl Into<Cow<'static, str>>) -> Self {
Self::new(RUNTIME_EXCEPTION_CLASS, message)
}

/// Create a new exception with a specified class and message
pub fn new(class: &'static str, message: impl Into<Cow<'static, str>>) -> Self {
Self {
class,
message: message.into(),
}
}

/// Throw the generated exception on a JNIEnv
pub fn set_exception_details(&self, env: &mut JNIEnv) {
let _ = env.throw_new(self.class, &self.message);
}
}

// Default conversion for Rust std errors - throw RuntimeException
impl<T: std::error::Error> From<T> for JavaException {
fn from(value: T) -> Self {
Self::new_runtime_exception(format!("{value}"))
}
}
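The blanket `From<T: std::error::Error>` impl above is what lets `?` inside the JNI closures convert any std error into a pending Java exception. The conversion pattern can be reproduced standalone, minus the actual JNI throw (the `parse_port` helper is illustrative, not part of FiloDB):

```rust
use std::borrow::Cow;

// Mirror of the JavaException shape from the diff, without the JNIEnv call.
#[derive(Debug)]
struct JavaException {
    class: &'static str,
    message: Cow<'static, str>,
}

impl JavaException {
    fn new_runtime_exception(message: impl Into<Cow<'static, str>>) -> Self {
        Self {
            class: "java/lang/RuntimeException",
            message: message.into(),
        }
    }
}

// Blanket conversion: any std error becomes a RuntimeException.
// Coherent because JavaException itself does not implement std::error::Error.
impl<T: std::error::Error> From<T> for JavaException {
    fn from(value: T) -> Self {
        Self::new_runtime_exception(format!("{value}"))
    }
}

fn parse_port(s: &str) -> Result<u16, JavaException> {
    // `?` converts std::num::ParseIntError via the blanket impl
    Ok(s.parse::<u16>()?)
}

fn main() {
    assert_eq!(parse_port("8080").unwrap(), 8080);
    let err = parse_port("not-a-port").unwrap_err();
    assert_eq!(err.class, "java/lang/RuntimeException");
    println!("message carried to the JVM: {}", err.message);
}
```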
56 changes: 56 additions & 0 deletions core/src/rust/filodb_core/src/exec.rs
@@ -0,0 +1,56 @@
//! Helpers for executing code in a JNI method
use jni::{sys::jobject, JNIEnv};

use crate::errors::JavaResult;

/// Execs a function in a JNI context, supplying an environment
/// and translating any errors to exceptions
///
/// All JNI functions should use this to ensure error handling
/// is properly done
///
/// Do *not* panic in any calls - avoid unwrap, expect, etc.
pub fn jni_exec<F, T>(env: &mut JNIEnv, func: F) -> T
where
F: FnOnce(&mut JNIEnv) -> JavaResult<T>,
T: EarlyReturn,
{
let ret = func(env);
match ret {
Ok(r) => r,
Err(e) => {
// An error occurred, throw an exception
e.set_exception_details(env);

T::abort_value()
}
}
}

/// Trait for early return values when an exception is being thrown
pub trait EarlyReturn {
fn abort_value() -> Self;
}

impl EarlyReturn for jobject {
fn abort_value() -> Self {
std::ptr::null_mut()
}
}

impl EarlyReturn for i32 {
fn abort_value() -> Self {
0
}
}

impl EarlyReturn for i64 {
fn abort_value() -> Self {
0
}
}

impl EarlyReturn for () {
fn abort_value() -> Self {}
}
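The `EarlyReturn` trait exists because a JNI method must still return a value of its declared type even after an exception has been queued for the JVM; `abort_value` supplies that dummy per type. The mechanism, sketched without the `jni` crate (`exec` and the `pending` slot stand in for `jni_exec` and `set_exception_details`):

```rust
// Sentinel values returned to the caller after an error has been recorded.
trait EarlyReturn {
    fn abort_value() -> Self;
}

impl EarlyReturn for i32 {
    fn abort_value() -> Self { 0 }
}

impl EarlyReturn for i64 {
    fn abort_value() -> Self { 0 }
}

impl EarlyReturn for () {
    fn abort_value() -> Self {}
}

// Stand-in for jni_exec: run the closure; on error, record the message
// (where the real code would throw a Java exception) and return the
// type's abort value instead of panicking.
fn exec<F, T>(pending: &mut Option<String>, func: F) -> T
where
    F: FnOnce() -> Result<T, String>,
    T: EarlyReturn,
{
    match func() {
        Ok(v) => v,
        Err(e) => {
            *pending = Some(e);
            T::abort_value()
        }
    }
}

fn main() {
    let mut pending = None;
    let ok: i64 = exec(&mut pending, || Ok(42));
    assert_eq!(ok, 42);
    assert!(pending.is_none());

    let failed: i64 = exec(&mut pending, || Err("boom".to_string()));
    assert_eq!(failed, 0); // the abort value for i64
    assert_eq!(pending.as_deref(), Some("boom"));
}
```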
183 changes: 183 additions & 0 deletions core/src/rust/filodb_core/src/index.rs
@@ -0,0 +1,183 @@
//! Methods to create / destroy the index
use jni::{
objects::{JClass, JObjectArray, JString},
sys::{jfloat, jlong},
JNIEnv,
};
use tantivy::{
directory::MmapDirectory,
indexer::LogMergePolicy,
schema::{
BytesOptions, FacetOptions, Field, JsonObjectOptions, NumericOptions, Schema,
SchemaBuilder, TextFieldIndexing, TextOptions,
},
IndexBuilder, IndexSettings, ReloadPolicy, TantivyDocument,
};
use tantivy_utils::field_constants::{self, facet_field_name, LABEL_LIST};

use crate::{
errors::{JavaException, JavaResult},
exec::jni_exec,
jnienv::JNIEnvExt,
state::IndexHandle,
};

pub const WRITER_MEM_BUDGET: usize = 50 * 1024 * 1024;

/// Create a new index state object by loading and configuring schema
#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_newIndexHandle(
mut env: JNIEnv,
_class: JClass,
disk_location: JString,
schema_fields: JObjectArray,
map_fields: JObjectArray,
multi_column_facet_fields: JObjectArray,
column_cache_size: jlong,
query_cache_max_size: jlong,
query_cache_estimated_item_size: jlong,
deleted_doc_merge_threshold: jfloat,
) -> jlong {
jni_exec(&mut env, |env| {
let disk_location: String = env.get_string(&disk_location)?.into();
std::fs::create_dir_all(&disk_location)?;

let directory = MmapDirectory::open(disk_location)?;

// Build the schema for documents
let (schema, default_field) =
build_schema(env, &schema_fields, &map_fields, &multi_column_facet_fields)?;

// Open index
let settings = IndexSettings {
..Default::default()
};

let index = IndexBuilder::new()
.schema(schema.clone())
.settings(settings)
.open_or_create(directory.clone())?;

let writer = index.writer::<TantivyDocument>(WRITER_MEM_BUDGET)?;

let mut merge_policy = LogMergePolicy::default();
merge_policy.set_del_docs_ratio_before_merge(deleted_doc_merge_threshold);

writer.set_merge_policy(Box::new(merge_policy));

let reader = index
.reader_builder()
// It's tempting to use Manual here as we call refresh periodically
// from a timer thread. However, refresh just means that you can see
// all uncommitted documents, not that all merges have completed. This
// means that background merges that are happening that could speed up
// queries aren't available when manual is used. Instead we use
// on commit - the cost of this is minor since it's a FS notification
// and reloading the segment list is fairly cheap and infrequent.
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()?;

Ok(IndexHandle::new_handle(
schema,
default_field,
writer,
reader,
directory,
column_cache_size as u64,
query_cache_max_size as u64,
query_cache_estimated_item_size as u64,
))
})
}

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_freeIndexHandle(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
) {
jni_exec(&mut env, |_| {
unsafe {
drop(Box::from_raw(handle as *mut IndexHandle));
}

Ok(())
});
}

fn build_schema(
env: &mut JNIEnv,
schema_fields: &JObjectArray,
map_fields: &JObjectArray,
multi_column_facet_fields: &JObjectArray,
) -> JavaResult<(Schema, Option<Field>)> {
let mut builder = SchemaBuilder::new();

let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_fieldnorms(false),
);

let random_access_text_options = text_options.clone().set_fast(Some("raw"));

let numeric_options = NumericOptions::default().set_indexed().set_fast();

// Bytes values are faster to read via the doc store vs fast fields and we don't need any of the fast
// field only features like iterating by sorted values
let byte_options = BytesOptions::default().set_indexed().set_stored();

builder.add_text_field(field_constants::DOCUMENT_ID, text_options.clone());
builder.add_i64_field(field_constants::PART_ID, numeric_options.clone());
builder.add_bytes_field(field_constants::PART_KEY, byte_options);
builder.add_i64_field(field_constants::START_TIME, numeric_options.clone());
builder.add_i64_field(field_constants::END_TIME, numeric_options.clone());
builder.add_text_field(field_constants::TYPE, text_options.clone());

// Fields from input schema
env.foreach_string_in_array(schema_fields, |name| {
builder.add_text_field(&name, random_access_text_options.clone());

Ok(())
})?;

// Map fields - only one supported
let len = env.get_array_length(map_fields)?;
if len > 1 {
return Err(JavaException::new_runtime_exception(
"More than one map field specified",
));
}

let default_field = if len == 1 {
let name = env.get_object_array_element(map_fields, 0)?.into();
let name = env.get_rust_string(&name)?;

let field = builder.add_json_field(
&name,
JsonObjectOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_fieldnorms(false),
)
.set_fast(Some("raw")),
);

Some(field)
} else {
None
};

env.foreach_string_in_array(multi_column_facet_fields, |name| {
builder.add_text_field(&name, random_access_text_options.clone());

Ok(())
})?;

// Default facet for label list, always added
builder.add_facet_field(&facet_field_name(LABEL_LIST), FacetOptions::default());

Ok((builder.build(), default_field))
}
266 changes: 266 additions & 0 deletions core/src/rust/filodb_core/src/ingestion.rs
@@ -0,0 +1,266 @@
//! Methods that modify the index / do data ingestion
use std::{ops::Bound, sync::atomic::Ordering};

use fields::add_fields;
use jni::{
objects::{JByteArray, JClass, JIntArray, JString},
sys::{jboolean, jint, jlong, JNI_TRUE},
JNIEnv,
};
use tantivy::{
collector::Count,
indexer::UserOperation,
query::{RangeQuery, TermSetQuery},
schema::Facet,
TantivyDocument, Term,
};
use tantivy_utils::field_constants::{self, facet_field_name};

use crate::{
errors::JavaResult,
exec::jni_exec,
jnienv::JNIEnvExt,
state::{IndexHandle, IngestingDocument},
};

mod fields;

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_reset(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
) {
jni_exec(&mut env, |_| {
let handle = IndexHandle::get_ref_from_handle(handle);

handle.changes_pending.store(false, Ordering::SeqCst);

let mut writer = handle.writer.write()?;
writer.delete_all_documents()?;
writer.commit()?;

handle.changes_pending.store(false, Ordering::SeqCst);

Ok(())
});
}

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_commit(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
) {
jni_exec(&mut env, |_| {
let handle = IndexHandle::get_ref_from_handle(handle);

handle.changes_pending.store(false, Ordering::SeqCst);

let mut writer = handle.writer.write()?;
writer.commit()?;

Ok(())
});
}

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_ingestDocument(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
part_key_data: JByteArray,
part_key_offset: jint,
part_key_num_bytes: jint,
part_id: jint,
document_id: JString,
start_time: jlong,
end_time: jlong,
fields: JByteArray,
upsert: jboolean,
) {
jni_exec(&mut env, |env| {
let handle = IndexHandle::get_ref_from_handle(handle);

let mut ingesting_doc = IngestingDocument::default();

if part_id > -1 {
ingesting_doc.doc.add_i64(
handle.schema.get_field(field_constants::PART_ID)?,
part_id.into(),
);
}

let document_id = env.get_rust_string(&document_id)?;
ingesting_doc.doc.add_text(
handle.schema.get_field(field_constants::DOCUMENT_ID)?,
document_id.clone(),
);

ingesting_doc.doc.add_i64(
handle.schema.get_field(field_constants::START_TIME)?,
start_time,
);

ingesting_doc.doc.add_i64(
handle.schema.get_field(field_constants::END_TIME)?,
end_time,
);

let bytes = env.get_byte_array_offset_len(
&part_key_data,
part_key_offset as usize,
part_key_num_bytes as usize,
)?;

ingesting_doc
.doc
.add_bytes(handle.schema.get_field(field_constants::PART_KEY)?, bytes);

// Add dynamic fields
let fields = env.get_byte_array(&fields)?;
add_fields(&fields, &mut ingesting_doc, &handle.schema)?;

let doc = prepare_tantivy_doc(handle, &mut ingesting_doc)?;

// Save it
let writer = handle.writer.read()?;

if upsert == JNI_TRUE {
let delete_term = Term::from_field_text(
handle.schema.get_field(field_constants::DOCUMENT_ID)?,
&document_id,
);

let writer = handle.writer.read()?;
writer.run([UserOperation::Delete(delete_term), UserOperation::Add(doc)])?;

handle.changes_pending.store(true, Ordering::SeqCst);
} else {
writer.add_document(doc)?;
}

handle.changes_pending.store(true, Ordering::SeqCst);

Ok(())
});
}

fn prepare_tantivy_doc(
handle: &IndexHandle,
ingesting_doc: &mut IngestingDocument,
) -> JavaResult<TantivyDocument> {
let mut map_values = std::mem::take(&mut ingesting_doc.map_values);

// Insert map columns we've built up
for (key, value) in map_values.drain() {
ingesting_doc
.doc
.add_object(handle.schema.get_field(&key)?, value);
}

// Build final facet for field list
let mut field_names = std::mem::take(&mut ingesting_doc.field_names);
field_names.sort();

for field in field_names {
add_facet(
handle,
ingesting_doc,
field_constants::LABEL_LIST,
&[field.as_str()],
)?;
}

let doc = std::mem::take(&mut ingesting_doc.doc);

Ok(doc)
}

fn add_facet(
handle: &IndexHandle,
ingesting_doc: &mut IngestingDocument,
name: &str,
value: &[&str],
) -> JavaResult<()> {
if !name.is_empty() && !value.is_empty() {
ingesting_doc.doc.add_facet(
handle.schema.get_field(&facet_field_name(name))?,
Facet::from_path(value),
);
}

Ok(())
}

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_removePartKeys(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
keys: JIntArray,
) {
jni_exec(&mut env, |env| {
let handle = IndexHandle::get_ref_from_handle(handle);
let mut terms = vec![];

let field = handle.schema.get_field(field_constants::PART_ID)?;

let len = env.get_array_length(&keys)?;
let mut part_ids = vec![0i32; len as usize];

env.get_int_array_region(&keys, 0, &mut part_ids)?;

for part_id in part_ids {
terms.push(Term::from_field_i64(field, part_id as i64));
}

let query = Box::new(TermSetQuery::new(terms));

let writer = handle.writer.read()?;
writer.delete_query(query)?;

handle.changes_pending.store(true, Ordering::SeqCst);

Ok(())
})
}

#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_removePartitionsEndedBefore(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
ended_before: jlong,
return_deleted_count: jboolean,
) -> jint {
jni_exec(&mut env, |_| {
let handle = IndexHandle::get_ref_from_handle(handle);

let query = RangeQuery::new_i64_bounds(
field_constants::END_TIME.to_string(),
Bound::Included(0),
// To match existing Lucene index behavior, make this inclusive even though it's named
// "ended before" in the API
Bound::Included(ended_before),
);

let java_ret = if return_deleted_count == JNI_TRUE {
let searcher = handle.reader.searcher();

let collector = Count;

searcher.search(&query, &collector)?
} else {
0
};

let writer = handle.writer.read()?;
writer.delete_query(Box::new(query))?;

handle.changes_pending.store(true, Ordering::SeqCst);

Ok(java_ret as i32)
})
}
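Note the inclusive upper bound here: to match the existing Lucene index, a partition whose end time equals `ended_before` is still removed despite the "ended before" name. The bound semantics can be checked with std's `Bound` alone, no tantivy involved (the end-time values are made up):

```rust
use std::ops::{Bound, RangeBounds};

// Same bounds the JNI method builds over END_TIME: [0, ended_before].
fn ended_range(ended_before: i64) -> (Bound<i64>, Bound<i64>) {
    (Bound::Included(0), Bound::Included(ended_before))
}

fn main() {
    let range = ended_range(100);
    // The boundary value 100 IS matched (inclusive upper bound),
    // mirroring the comment about matching Lucene behavior.
    let end_times = [50i64, 100, 101];
    let deleted: Vec<i64> = end_times
        .iter()
        .copied()
        .filter(|t| range.contains(t))
        .collect();
    assert_eq!(deleted, vec![50, 100]);
}
```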
204 changes: 204 additions & 0 deletions core/src/rust/filodb_core/src/ingestion/fields.rs
@@ -0,0 +1,204 @@
//! Working with field data
use std::collections::BTreeMap;

use nom::{Err, IResult};
use num_derive::FromPrimitive;
use tantivy::schema::Schema;

use crate::{
parser::{parse_string, parse_type_id, AsNomError, ParserError, TypeParseResult},
state::IngestingDocument,
};

#[derive(FromPrimitive)]
#[repr(u8)]
enum FieldTypeId {
Indexed = 1,
Map = 2,
Multicolumn = 3,
}

pub fn add_fields<'a>(
input: &'a [u8],
doc: &mut IngestingDocument,
schema: &Schema,
) -> IResult<&'a [u8], (), ParserError> {
let mut next_input = input;

while !next_input.is_empty() {
let (input, type_id) = parse_type_id(next_input)?;

let (input, _) = match type_id {
TypeParseResult::Success(FieldTypeId::Indexed) => {
parse_indexed_field(input, doc, schema)?
}
TypeParseResult::Success(FieldTypeId::Map) => parse_map_field(input, doc)?,
TypeParseResult::Success(FieldTypeId::Multicolumn) => {
parse_multicolumn_field(input, doc, schema)?
}
TypeParseResult::Failure(type_id) => {
return Err(Err::Failure(ParserError::UnknownType(type_id)))
}
};

next_input = input;
}

Ok((next_input, ()))
}

fn parse_indexed_field<'a>(
input: &'a [u8],
doc: &mut IngestingDocument,
schema: &Schema,
) -> IResult<&'a [u8], (), ParserError> {
let (input, field_name) = parse_string(input)?;
let (input, value) = parse_string(input)?;

let field = schema.get_field(&field_name).to_nom_err()?;

doc.doc.add_text(field, value);
doc.field_names.push(field_name.to_string());

Ok((input, ()))
}

fn parse_map_field<'a>(
input: &'a [u8],
doc: &mut IngestingDocument,
) -> IResult<&'a [u8], (), ParserError> {
let (input, map_name) = parse_string(input)?;
let (input, field_name) = parse_string(input)?;
let (input, value) = parse_string(input)?;

// Create new map for this map column if needed
if !doc.map_values.contains_key(map_name.as_ref()) {
doc.map_values.insert(map_name.to_string(), BTreeMap::new());
}

// Capture value
doc.map_values
.get_mut(map_name.as_ref())
.ok_or_else(|| Err::Failure(ParserError::InternalMapError))?
.insert(field_name.to_string(), value.to_string().into());
doc.field_names.push(field_name.to_string());

Ok((input, ()))
}

fn parse_multicolumn_field<'a>(
input: &'a [u8],
doc: &mut IngestingDocument,
schema: &Schema,
) -> IResult<&'a [u8], (), ParserError> {
let (input, field_name) = parse_string(input)?;
let (input, value) = parse_string(input)?;

let field = schema.get_field(&field_name).to_nom_err()?;

doc.doc.add_text(field, value);
doc.field_names.push(field_name.to_string());

Ok((input, ()))
}

#[cfg(test)]
mod tests {
use bytes::BufMut;
use tantivy::{schema::OwnedValue, Document};

use tantivy_utils::test_utils::{
build_test_schema, COL1_NAME, JSON_ATTRIBUTE1_NAME, JSON_COL_NAME,
};

use super::*;

#[test]
fn test_parse_indexed_field() {
let mut doc = IngestingDocument::default();
let index = build_test_schema();

let mut buf = vec![];

let expected = "abcd";

buf.put_u16_le(COL1_NAME.len() as u16);
buf.put_slice(COL1_NAME.as_bytes());
buf.put_u16_le(expected.len() as u16);
buf.put_slice(expected.as_bytes());

let _ = parse_indexed_field(&buf, &mut doc, &index.schema).expect("Should succeed");

assert!(doc.field_names.contains(&COL1_NAME.to_string()));
assert_eq!(
**doc
.doc
.get_sorted_field_values()
.first()
.unwrap()
.1
.first()
.unwrap(),
OwnedValue::Str(expected.into())
);
}

#[test]
fn test_parse_map_field() {
let mut doc = IngestingDocument::default();

let mut buf = vec![];

let expected = "abcd";

buf.put_u16_le(JSON_COL_NAME.len() as u16);
buf.put_slice(JSON_COL_NAME.as_bytes());
buf.put_u16_le(JSON_ATTRIBUTE1_NAME.len() as u16);
buf.put_slice(JSON_ATTRIBUTE1_NAME.as_bytes());
buf.put_u16_le(expected.len() as u16);
buf.put_slice(expected.as_bytes());

let _ = parse_map_field(&buf, &mut doc).expect("Should succeed");

assert!(doc.field_names.contains(&JSON_ATTRIBUTE1_NAME.to_string()));
assert_eq!(
*doc.map_values
.get(JSON_COL_NAME)
.unwrap()
.get(JSON_ATTRIBUTE1_NAME)
.unwrap(),
OwnedValue::Str(expected.into())
);
}

#[test]
fn test_parse_multicolumn_field() {
let mut doc = IngestingDocument::default();
let index = build_test_schema();

let mut buf = vec![];

let expected = "abcd";

buf.put_u16_le(COL1_NAME.len() as u16);
buf.put_slice(COL1_NAME.as_bytes());
buf.put_u16_le(expected.len() as u16);
buf.put_slice(expected.as_bytes());

let _ = parse_multicolumn_field(&buf, &mut doc, &index.schema).expect("Should succeed");

assert!(doc.field_names.contains(&COL1_NAME.to_string()));
assert_eq!(
**doc
.doc
.get_sorted_field_values()
.first()
.unwrap()
.1
.first()
.unwrap(),
OwnedValue::Str(expected.into())
);
}
}
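The binary layout consumed by `add_fields` — a one-byte type id followed by length-prefixed UTF-8 strings, exactly what the tests above build with `BufMut` — can be sketched as a dependency-free encoder. `put_string` and `encode_indexed_field` are hypothetical helpers for illustration, not part of this PR:

```rust
/// Append a string as a little-endian u16 length followed by its bytes.
fn put_string(buf: &mut Vec<u8>, s: &str) {
    buf.extend_from_slice(&(s.len() as u16).to_le_bytes());
    buf.extend_from_slice(s.as_bytes());
}

/// Encode one "indexed field" record as the parser expects:
/// [type id: u8][name len: u16 LE][name][value len: u16 LE][value]
fn encode_indexed_field(buf: &mut Vec<u8>, name: &str, value: &str) {
    buf.push(1); // FieldTypeId::Indexed
    put_string(buf, name);
    put_string(buf, value);
}
```

Map fields follow the same pattern with type id 2 and three length-prefixed strings (map name, field name, value).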
88 changes: 88 additions & 0 deletions core/src/rust/filodb_core/src/jnienv.rs
@@ -0,0 +1,88 @@
//! Extensions to JNIEnv
use jni::{
objects::{JByteArray, JObject, JObjectArray, JString},
JNIEnv,
};

use crate::errors::JavaResult;

/// Helper extensions for working with JVM types
#[allow(dead_code)]
pub trait JNIEnvExt<'a> {
/// Get a rust string from Java String
fn get_rust_string(&mut self, obj: &JString) -> JavaResult<String>;

/// Get a class name and return it as a string
/// This is equivalent to Java code `obj.class.name`
fn get_object_class_name(&mut self, obj: &JObject) -> JavaResult<String>;

/// Run a closure over every String in a String[]
fn foreach_string_in_array<F>(&mut self, array: &JObjectArray, func: F) -> JavaResult<()>
where
F: FnMut(String) -> JavaResult<()>;

/// Get a byte array from the JVM
fn get_byte_array_offset_len(
&mut self,
array: &JByteArray,
offset: usize,
len: usize,
) -> JavaResult<Vec<u8>>;

/// Get a byte array from the JVM
fn get_byte_array(&mut self, array: &JByteArray) -> JavaResult<Vec<u8>>;
}

impl<'a> JNIEnvExt<'a> for JNIEnv<'a> {
fn get_rust_string(&mut self, obj: &JString) -> JavaResult<String> {
let ret = self.get_string(obj)?.into();
Ok(ret)
}

fn get_object_class_name(&mut self, obj: &JObject) -> JavaResult<String> {
let class = self.get_object_class(obj)?;
let name = self
.get_field(&class, "name", "Ljava/lang/String;")?
.l()?
.into();

let ret = self.get_string(&name)?.into();
Ok(ret)
}

fn foreach_string_in_array<F>(&mut self, array: &JObjectArray, mut func: F) -> JavaResult<()>
where
F: FnMut(String) -> JavaResult<()>,
{
let len = self.get_array_length(array)?;
for idx in 0..len {
let s = self.get_object_array_element(array, idx)?.into();
let s = self.get_rust_string(&s)?;
func(s)?;
}

Ok(())
}

fn get_byte_array_offset_len(
&mut self,
array: &JByteArray,
offset: usize,
len: usize,
) -> JavaResult<Vec<u8>> {
let mut bytes = vec![0u8; len];
let bytes_ptr = bytes.as_mut_ptr() as *mut i8;
// SAFETY: u8 and i8 have identical size and alignment, so viewing the
// freshly allocated buffer as an i8 slice for the JNI call is sound.
let bytes_ptr = unsafe { std::slice::from_raw_parts_mut(bytes_ptr, len) };

self.get_byte_array_region(array, offset as i32, bytes_ptr)?;

Ok(bytes)
}

fn get_byte_array(&mut self, array: &JByteArray) -> JavaResult<Vec<u8>> {
let len = self.get_array_length(array)?;

self.get_byte_array_offset_len(array, 0, len as usize)
}
}
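`get_byte_array_offset_len` relies on reinterpreting a `u8` buffer as `i8` to satisfy the JNI signature. A standalone, std-only sketch of that reinterpretation (sound because the two types share size and alignment; the function name is hypothetical):

```rust
/// View a mutable byte buffer as a signed-byte slice without copying.
fn as_i8_slice_mut(bytes: &mut [u8]) -> &mut [i8] {
    // SAFETY: u8 and i8 have identical layout; the pointer and length
    // come from a valid, exclusively borrowed slice.
    unsafe { std::slice::from_raw_parts_mut(bytes.as_mut_ptr() as *mut i8, bytes.len()) }
}
```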
30 changes: 30 additions & 0 deletions core/src/rust/filodb_core/src/lib.rs
@@ -0,0 +1,30 @@
//! Native methods for FiloDB core
//!
//! This library extensively uses JNI to interop with JVM code.
//!
//! Any new code should do the following to ensure consistency:
//!
//! * All JNI methods should be wrapped in jni_exec. This turns any
//! Rust errors into RuntimeExceptions and allows for cleaner Rust
//! error handling.
//! * No panic/unwrap/expect calls should be used. Panicking will take
//! down the JVM process.
//! * Prefer primitive types when possible. Getting fields on JVM
//! objects requires reflection-like overhead that cannot be optimized
//! away.
//! * Minimize calls back into the JVM. Prefer to receive needed
//! information as arguments rather than calling object methods.
//!
#![deny(clippy::expect_used, clippy::unwrap_used, clippy::panic)]

mod errors;
mod exec;
mod index;
mod ingestion;
mod jnienv;
mod parser;
mod profile;
mod query_parser;
mod reader;
mod state;
146 changes: 146 additions & 0 deletions core/src/rust/filodb_core/src/parser.rs
@@ -0,0 +1,146 @@
//! Binary parser helpers
use std::borrow::Cow;

use nom::{
bytes::streaming::take,
error::{ErrorKind, ParseError},
number::streaming::{le_u16, u8},
IResult,
};
use num_traits::FromPrimitive;
use tantivy::TantivyError;
use thiserror::Error;

/// Error type for query parsing issues
#[derive(Error, Debug)]
pub enum ParserError {
#[error("Core parsing error: {0:?}")]
Nom(ErrorKind),
#[error("Index error: {0}")]
IndexError(#[from] TantivyError),
#[error("Unknown type byte: {0}")]
UnknownType(u8),
#[error("Unknown occur byte: {0}")]
UnknownOccur(u8),
#[error("Internal map error")]
InternalMapError,
}

pub trait AsNomError<T> {
fn to_nom_err(self) -> Result<T, nom::Err<ParserError>>;
}

impl<T> AsNomError<T> for Result<T, TantivyError> {
fn to_nom_err(self) -> Result<T, nom::Err<ParserError>> {
match self {
Err(e) => Err(nom::Err::Failure(e.into())),
Ok(x) => Ok(x),
}
}
}

impl<'a> ParseError<&'a [u8]> for ParserError {
fn from_error_kind(_input: &'a [u8], kind: ErrorKind) -> Self {
ParserError::Nom(kind)
}

fn append(_input: &'a [u8], _kind: ErrorKind, other: Self) -> Self {
other
}
}

pub fn parse_string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>, ParserError> {
let (input, length) = le_u16(input)?;
let (input, string_data) = take(length)(input)?;

Ok((input, String::from_utf8_lossy(string_data)))
}

#[derive(PartialEq, Debug)]
pub enum TypeParseResult<T> {
Success(T),
Failure(u8),
}

impl<T> From<u8> for TypeParseResult<T>
where
T: FromPrimitive,
{
fn from(value: u8) -> Self {
match T::from_u8(value) {
Some(val) => Self::Success(val),
None => Self::Failure(value),
}
}
}

pub fn parse_type_id<T>(input: &[u8]) -> IResult<&[u8], TypeParseResult<T>, ParserError>
where
T: FromPrimitive,
{
let (input, type_id) = u8(input)?;
Ok((input, type_id.into()))
}

#[cfg(test)]
mod tests {
use bytes::BufMut;
use num_derive::FromPrimitive;

use super::*;

#[test]
fn test_parse_string() {
let mut buf = vec![];

let expected = "abcd";

buf.put_u16_le(expected.len() as u16);
buf.put_slice(expected.as_bytes());

let (_, result) = parse_string(&buf).expect("Should succeed");

assert_eq!(result, expected);
}

#[test]
fn test_parse_empty_string() {
let mut buf = vec![];

buf.put_u16_le(0);

let (_, result) = parse_string(&buf).expect("Should succeed");

assert_eq!(result, "");
}

#[derive(FromPrimitive, Debug, PartialEq)]
#[repr(u8)]
pub enum TestTypeId {
Val1 = 1,
Val2 = 2,
}

#[test]
fn test_parse_type_id() {
let mut buf = vec![];

buf.put_u8(1);

let (_, result) = parse_type_id(&buf).expect("Should succeed");

assert_eq!(result, TypeParseResult::Success(TestTypeId::Val1));
}

#[test]
fn test_parse_type_id_invalid() {
let mut buf = vec![];

buf.put_u8(3);

let (_, result) = parse_type_id::<TestTypeId>(&buf).expect("Should succeed");

assert_eq!(result, TypeParseResult::Failure(3));
}
}
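As a hedged, nom-free illustration of the wire format `parse_string` handles — a little-endian `u16` length followed by that many UTF-8 bytes — the same logic can be written against plain slices (the function name here is hypothetical):

```rust
/// Parse one length-prefixed string, returning (remaining input, value),
/// mirroring nom's (rest, output) contract. None means "need more data".
fn parse_string_sketch(input: &[u8]) -> Option<(&[u8], String)> {
    if input.len() < 2 {
        return None;
    }
    let len = u16::from_le_bytes([input[0], input[1]]) as usize;
    let rest = &input[2..];
    if rest.len() < len {
        return None;
    }
    // Lossy conversion, as in the real parser, so bad UTF-8 can't fail ingest
    let value = String::from_utf8_lossy(&rest[..len]).into_owned();
    Some((&rest[len..], value))
}
```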
149 changes: 149 additions & 0 deletions core/src/rust/filodb_core/src/profile.rs
@@ -0,0 +1,149 @@
//! Helpers for profiling / testing
#[cfg(feature = "dhat-heap")]
use std::sync::Mutex;

use jni::{
objects::JClass,
sys::{jdoubleArray, jlong, jstring},
JNIEnv,
};

use crate::{exec::jni_exec, state::IndexHandle};

/// Get cache stats info
#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_dumpCacheStats(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
) -> jstring {
jni_exec(&mut env, |env| {
let index = IndexHandle::get_ref_from_handle(handle);

let (column_hits, column_misses) = index.column_cache.stats();
let (query_hits, query_misses) = index.query_cache_stats();

let output = format!(
"Column cache: {} hits {} misses {}% hit\nQuery cache: {} hits {} misses {}% hit",
column_hits,
column_misses,
cache_hit_rate(column_hits, column_misses),
query_hits,
query_misses,
cache_hit_rate(query_hits, query_misses),
);

let java_str = env.new_string(output)?;

Ok(java_str.into_raw())
})
}

/// Get cache hit rates
#[no_mangle]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_getCacheHitRates(
mut env: JNIEnv,
_class: JClass,
handle: jlong,
) -> jdoubleArray {
jni_exec(&mut env, |env| {
let index = IndexHandle::get_ref_from_handle(handle);

let (column_hits, column_misses) = index.column_cache.stats();
let (query_hits, query_misses) = index.query_cache_stats();

let column_total = column_hits + column_misses;
let query_total = query_hits + query_misses;
let column_hit_rate = if column_total == 0 {
1.0f64
} else {
(column_hits as f64) / (column_total) as f64
};

let query_hit_rate = if query_total == 0 {
1.0f64
} else {
(query_hits as f64) / (query_total) as f64
};

// Contract with JVM code is (query hit rate, column hit rate)
let hit_rates = [query_hit_rate, column_hit_rate];

let result = env.new_double_array(hit_rates.len() as i32)?;
env.set_double_array_region(&result, 0, &hit_rates)?;

Ok(result.into_raw())
})
}

/// Start memory profiling
#[no_mangle]
#[allow(unused_mut, unused_variables)]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_startMemoryProfiling(
mut env: JNIEnv,
_class: JClass,
) {
#[cfg(feature = "dhat-heap")]
jni_exec(&mut env, |_| {
PROFILER.lock()?.replace(dhat::Profiler::new_heap());

Ok(())
});
}

/// Stop memory profiling
#[no_mangle]
#[allow(unused_mut, unused_variables)]
pub extern "system" fn Java_filodb_core_memstore_TantivyNativeMethods_00024_stopMemoryProfiling(
mut env: JNIEnv,
_class: JClass,
) {
#[cfg(feature = "dhat-heap")]
jni_exec(&mut env, |_| {
PROFILER.lock()?.take();

Ok(())
});
}

#[cfg(feature = "dhat-heap")]
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

#[cfg(feature = "dhat-heap")]
static PROFILER: Mutex<Option<dhat::Profiler>> = Mutex::new(None);

fn cache_hit_rate(hits: u64, misses: u64) -> String {
format!("{:0.2}", (hits as f64 / (hits + misses) as f64) * 100.0)
}

#[cfg(test)]
mod tests {
use quick_cache::sync::Cache;

use super::*;

#[test]
fn test_cache_hit_percent() {
let cache: Cache<i32, ()> = Cache::new(100);

for i in 0..20 {
cache.insert(i, ());
}

for i in 0..100 {
cache.get(&i);
}

let hits = cache.hits();
let misses = cache.misses();

assert_eq!(20, hits);
assert_eq!(80, misses);

let hit_rate = cache_hit_rate(hits, misses);

assert_eq!("20.00", hit_rate);
}
}
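The zero-total guard used when computing rates in `getCacheHitRates` — an empty cache reports a 1.0 hit rate rather than alarming at 0% — can be factored into a small helper. This is a sketch of that logic, not code from the PR:

```rust
/// Hit rate in [0.0, 1.0]; an empty cache (no hits or misses yet)
/// reports 1.0 so fresh caches don't look like a 0% hit rate.
fn hit_rate(hits: u64, misses: u64) -> f64 {
    let total = hits + misses;
    if total == 0 {
        1.0
    } else {
        hits as f64 / total as f64
    }
}
```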
997 changes: 997 additions & 0 deletions core/src/rust/filodb_core/src/query_parser.rs

Large diffs are not rendered by default.

274 changes: 274 additions & 0 deletions core/src/rust/filodb_core/src/query_parser/filodb_query.rs
@@ -0,0 +1,274 @@
//! Cachable query implementation
use std::{ops::Bound, sync::Arc};

use quick_cache::Weighter;
use tantivy::{
query::{AllQuery, Query, RangeQuery, TermQuery, TermSetQuery},
schema::{Field, IndexRecordOption, Schema},
SegmentId, TantivyError, Term,
};
use tantivy_common::BitSet;
use tantivy_utils::field_constants;

use super::parse_query;

/// A query that can potentially be cached
///
/// We can't just hold a reference to Tantivy's Query objects because
/// they don't implement Hash/Eq, so they can't be used as a cache key
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub enum FiloDBQuery {
/// A complex query that is serialized in byte form
Complex(Arc<Box<[u8]>>),
/// Search by part key
ByPartKey(Arc<Box<[u8]>>),
/// Search by list of part IDs
ByPartIds(Arc<Box<[i32]>>),
/// Search by end time
ByEndTime(i64),
/// Search for single part ID (not cached)
ByPartId(i32),
/// All docs query (not cached)
All,
}

impl tantivy_utils::query::cache::CachableQuery for FiloDBQuery {
fn should_cache(&self) -> bool {
match self {
FiloDBQuery::Complex(_) => true,
FiloDBQuery::ByPartIds(_) => true,
FiloDBQuery::ByEndTime(_) => true,
// No point caching all docs - the "query" is constant time anyway
&FiloDBQuery::All => false,
// A single term lookup is very efficient - no benefit in caching the doc ID
FiloDBQuery::ByPartId(_) => false,
// Also single term lookup
FiloDBQuery::ByPartKey(_) => false,
}
}

fn to_query(
&self,
schema: &Schema,
default_field: Option<Field>,
) -> Result<Box<dyn Query>, TantivyError> {
match self {
FiloDBQuery::Complex(query_bytes) => {
let (_, query) = parse_query(query_bytes, schema, default_field)
.map_err(|e| TantivyError::InternalError(format!("{:#}", e)))?;

Ok(query)
}
FiloDBQuery::ByPartKey(part_key) => {
let field = schema.get_field(field_constants::PART_KEY)?;
let term = Term::from_field_bytes(field, part_key);
let query = TermQuery::new(term, IndexRecordOption::Basic);

Ok(Box::new(query))
}
FiloDBQuery::ByPartIds(part_ids) => {
let part_id_field = schema.get_field(field_constants::PART_ID)?;

let mut terms = Vec::with_capacity(part_ids.len());
for id in part_ids.iter() {
let term = Term::from_field_i64(part_id_field, *id as i64);
terms.push(term);
}

let query = TermSetQuery::new(terms);

Ok(Box::new(query))
}
FiloDBQuery::All => Ok(Box::new(AllQuery)),
FiloDBQuery::ByPartId(part_id) => {
let part_id_field = schema.get_field(field_constants::PART_ID)?;
let term = Term::from_field_i64(part_id_field, *part_id as i64);

let query = TermQuery::new(term, IndexRecordOption::Basic);

Ok(Box::new(query))
}
FiloDBQuery::ByEndTime(ended_at) => {
let query = RangeQuery::new_i64_bounds(
field_constants::END_TIME.to_string(),
Bound::Included(0),
Bound::Included(*ended_at),
);

Ok(Box::new(query))
}
}
}
}

#[derive(Clone, Default)]
pub struct CachableQueryWeighter;

// We want our cache to hold a maximum number of items based on their total size in RAM vs item count
// This is because not all segments are the same size / not all queries to cache are equal
//
// To do this we compute the weight of a given cache item as the size of the query key + the size
// of the cached bitfield of results. This enables quick_cache to ensure we never go too much above
// a fixed amount of RAM usage.
//
// The weight does not impact which items get evicted first, just how many need to get evicted to
// make space for a new incoming item.
impl Weighter<(SegmentId, FiloDBQuery), Arc<BitSet>> for CachableQueryWeighter {
fn weight(&self, key: &(SegmentId, FiloDBQuery), val: &Arc<BitSet>) -> u64 {
let bitset_size = ((val.max_value() as usize + 63) / 64) * 8;
let key_size = std::mem::size_of::<(SegmentId, FiloDBQuery)>();

let type_size = match &key.1 {
FiloDBQuery::Complex(bytes) => bytes.len() + std::mem::size_of::<Box<[u8]>>(),
FiloDBQuery::ByPartKey(part_key) => part_key.len() + std::mem::size_of::<Box<[u8]>>(),
FiloDBQuery::ByPartIds(part_ids) => {
(part_ids.len() * std::mem::size_of::<i32>()) + std::mem::size_of::<Box<[i32]>>()
}
FiloDBQuery::All => 0,
FiloDBQuery::ByPartId(_) => 0,
FiloDBQuery::ByEndTime(_) => 0,
};

(type_size + key_size + bitset_size) as u64
}
}

#[cfg(test)]
mod tests {
use tantivy::query::EmptyQuery;

use tantivy_utils::{query::cache::CachableQuery as _, test_utils::build_test_schema};

use super::*;

#[test]
fn test_should_cache() {
assert!(FiloDBQuery::Complex(Arc::new([0u8; 0].into())).should_cache());
assert!(FiloDBQuery::ByPartIds(Arc::new([0i32; 0].into())).should_cache());
assert!(FiloDBQuery::ByEndTime(0).should_cache());
assert!(!FiloDBQuery::All.should_cache());
assert!(!FiloDBQuery::ByPartId(0).should_cache());
assert!(!FiloDBQuery::ByPartKey(Arc::new([0u8; 0].into())).should_cache());
}

#[test]
fn test_complex_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::Complex(Arc::new([1u8, 0u8].into()));

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<EmptyQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
58
);
}

#[test]
fn test_partkey_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::ByPartKey(Arc::new([1u8, 0u8].into()));

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<TermQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
58
);
}

#[test]
fn test_endtime_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::ByEndTime(0);

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<RangeQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
40
);
}

#[test]
fn test_all_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::All;

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<AllQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
40
);
}

#[test]
fn test_partid_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::ByPartId(0);

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<TermQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
40
);
}

#[test]
fn test_partids_query() {
let index = build_test_schema();
let weighter = CachableQueryWeighter;
let reader = index.searcher.segment_readers().first().unwrap();
let query = FiloDBQuery::ByPartIds(Arc::new([1, 2].into()));

let parsed = query.to_query(&index.schema, None).expect("Should succeed");

assert!(parsed.is::<TermSetQuery>());

assert_eq!(
weighter.weight(
&(reader.segment_id(), query),
&Arc::new(BitSet::with_max_value(1))
),
64
);
}
}
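The bitset term in `CachableQueryWeighter::weight` above is the backing storage of a `BitSet` covering `max_value` bits: whole 64-bit words rounded up, at 8 bytes each. The arithmetic in isolation (helper name hypothetical):

```rust
/// Bytes used by a bitset of `max_value` bits stored in u64 words.
fn bitset_bytes(max_value: usize) -> usize {
    ((max_value + 63) / 64) * 8
}
```

With `max_value = 1` this yields 8 bytes, which is the bitset portion of the 40- and 58-byte weights asserted in the tests above.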
560 changes: 560 additions & 0 deletions core/src/rust/filodb_core/src/reader.rs

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions core/src/rust/filodb_core/src/state.rs
@@ -0,0 +1,150 @@
//! State objects shared with Java
use std::{
collections::{BTreeMap, HashMap},
sync::{atomic::AtomicBool, RwLock},
};

use filesize::PathExt;
use jni::sys::jlong;
use tantivy::{
directory::MmapDirectory,
schema::{Field, OwnedValue, Schema},
IndexReader, IndexWriter, Searcher, TantivyDocument, TantivyError,
};
use tantivy_utils::{
collectors::{
column_cache::ColumnCache,
limited_collector::{LimitedCollector, LimitedSegmentCollector},
},
query::cache::QueryCache,
};

use crate::query_parser::filodb_query::{CachableQueryWeighter, FiloDBQuery};

pub struct IndexHandle {
// Fields that don't need explicit synchronization
//
//
// Schema for this index
pub schema: Schema,
// Default field for JSON searches
pub default_field: Option<Field>,
// Active reader
pub reader: IndexReader,
// Cache of query -> docs
query_cache: QueryCache<FiloDBQuery, CachableQueryWeighter>,
// Are there changes pending to commit
pub changes_pending: AtomicBool,
// Column lookup cache
pub column_cache: ColumnCache,
// Mmap dir - used for stats only
pub mmap_directory: MmapDirectory,

// Fields that need synchronization
//
//
// Active writer
pub writer: RwLock<IndexWriter>,
}

impl IndexHandle {
#[allow(clippy::too_many_arguments)]
pub fn new_handle(
schema: Schema,
default_field: Option<Field>,
writer: IndexWriter,
reader: IndexReader,
mmap_directory: MmapDirectory,
column_cache_size: u64,
query_cache_max_size: u64,
query_cache_estimated_item_size: u64,
) -> jlong {
let estimated_item_count: u64 = query_cache_max_size / query_cache_estimated_item_size;

let obj = Box::new(Self {
schema,
default_field,
writer: RwLock::new(writer),
reader,
changes_pending: AtomicBool::new(false),
query_cache: QueryCache::new(estimated_item_count, query_cache_max_size),
column_cache: ColumnCache::new(column_cache_size as usize),
mmap_directory,
});

Box::into_raw(obj) as jlong
}

/// Decode handle back into a reference
pub fn get_ref_from_handle<'a>(handle: jlong) -> &'a Self {
let ptr = handle as *const IndexHandle;

unsafe { &*ptr }
}

pub fn query_cache_stats(&self) -> (u64, u64) {
self.query_cache.query_cache_stats()
}

pub fn query_cache_size(&self) -> u64 {
self.query_cache.size()
}

pub fn mmap_size(&self) -> u64 {
self.mmap_directory
.get_cache_info()
.mmapped
.into_iter()
.map(|p| p.as_path().size_on_disk().unwrap_or(0))
.sum()
}

pub fn searcher(&self) -> Searcher {
self.reader.searcher()
}

pub fn execute_cachable_query<C>(
&self,
cachable_query: FiloDBQuery,
collector: C,
) -> Result<C::Fruit, TantivyError>
where
C: LimitedCollector,
C::Child: LimitedSegmentCollector,
{
let searcher = self.reader.searcher();

self.execute_cachable_query_with_searcher(cachable_query, collector, &searcher)
}

pub fn execute_cachable_query_with_searcher<C>(
&self,
cachable_query: FiloDBQuery,
collector: C,
searcher: &Searcher,
) -> Result<C::Fruit, TantivyError>
where
C: LimitedCollector,
C::Child: LimitedSegmentCollector,
{
self.query_cache.search(
searcher,
&self.schema,
self.default_field,
cachable_query,
collector,
)
}
}

/// A document that is actively being built up for ingesting
#[derive(Default)]
pub struct IngestingDocument {
// List of map entries we're building up to store in the document
pub map_values: HashMap<String, BTreeMap<String, OwnedValue>>,
// List of field names in the document being ingested
pub field_names: Vec<String>,
// Document state for ingestion
pub doc: TantivyDocument,
}
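The handle pattern used by `IndexHandle` — leak a `Box`, hand its address to the JVM as a `jlong`, and re-borrow it on later native calls — can be demonstrated with std only. `Demo` is a hypothetical stand-in for `IndexHandle`, and `i64` stands in for `jlong`:

```rust
struct Demo {
    value: u32,
}

/// Leak the box and return its address as an opaque i64 handle.
fn into_handle(obj: Box<Demo>) -> i64 {
    Box::into_raw(obj) as i64
}

/// Borrow the object back from the handle. Sound only while the
/// handle is live and has not yet been freed.
fn ref_from_handle<'a>(handle: i64) -> &'a Demo {
    unsafe { &*(handle as *const Demo) }
}

/// Reclaim ownership so the object is dropped (the "free" half the
/// JVM side must eventually call exactly once).
fn free_handle(handle: i64) {
    drop(unsafe { Box::from_raw(handle as *mut Demo) });
}
```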
12 changes: 12 additions & 0 deletions core/src/rust/tantivy_utils/Cargo.toml
@@ -0,0 +1,12 @@
[package]
name = "tantivy_utils"
version = "0.1.0"
edition = "2021"

[dependencies]
hashbrown = "0.14.5"
nohash-hasher = "0.2.0"
quick_cache = { version = "0.6.2", features = ["stats"] }
tantivy = "0.22.0"
tantivy-common = "0.7.0"
tantivy-fst = "0.5.0"
10 changes: 10 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors.rs
@@ -0,0 +1,10 @@
//! Common collectors
pub mod column_cache;
pub mod limited_collector;
pub mod part_id_collector;
pub mod part_key_collector;
pub mod part_key_record_collector;
pub mod string_field_collector;
pub mod time_collector;
pub mod time_range_filter;
266 changes: 266 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/column_cache.rs
@@ -0,0 +1,266 @@
//! Cache for fast field columns
use std::sync::Arc;

use quick_cache::{sync::Cache, Equivalent};
use tantivy::{
columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType, StrColumn},
SegmentId, SegmentReader,
};

// Max column items to cache. These are relatively cheap (< 1KB)
// 1 item per column, per segment
const DEFAULT_COLUMN_CACHE_ITEM_COUNT: usize = 1000;

// Helper to avoid having to clone strings just to do a cache lookup
#[derive(Hash, PartialEq, Eq, Debug, Clone)]
struct CacheKey<'a>(SegmentId, &'a str);

impl<'a> From<CacheKey<'a>> for (SegmentId, String) {
fn from(value: CacheKey<'a>) -> Self {
(value.0, value.1.to_string())
}
}

impl<'a> Equivalent<(SegmentId, String)> for CacheKey<'a> {
fn equivalent(&self, key: &(SegmentId, String)) -> bool {
self.0 == key.0 && self.1 == key.1
}
}

/// Opening DynamicColumn instances requires parsing some headers
/// and other items that while fast, can add up if you're doing this
/// thousands of times a second. Since columns for a given segment
/// are immutable once created, caching this parsed data is safe
/// and cheap and can result in major speedups on things like
/// point queries.
#[derive(Clone)]
pub struct ColumnCache {
cache: Arc<Cache<(SegmentId, String), DynamicColumn>>,
}

impl Default for ColumnCache {
fn default() -> Self {
Self::new(DEFAULT_COLUMN_CACHE_ITEM_COUNT)
}
}

impl ColumnCache {
pub fn new(size: usize) -> Self {
Self {
cache: Arc::new(Cache::new(size)),
}
}

pub fn stats(&self) -> (u64, u64) {
(self.cache.hits(), self.cache.misses())
}

pub fn get_column<T>(
&self,
reader: &SegmentReader,
field: &str,
) -> tantivy::Result<Option<Column<T>>>
where
T: HasAssociatedColumnType,
DynamicColumn: From<Column<T>>,
DynamicColumn: Into<Option<Column<T>>>,
{
let key = CacheKey(reader.segment_id(), field);

if let Some(col) = self.cache.get(&key) {
Ok(col.into())
} else {
let column: Option<Column<T>> = reader.fast_fields().column_opt(field)?;

if let Some(col) = column {
self.cache.insert(key.into(), col.clone().into());

Ok(Some(col))
} else {
Ok(None)
}
}
}

pub fn get_bytes_column(
&self,
reader: &SegmentReader,
field: &str,
) -> tantivy::Result<Option<BytesColumn>> {
let key = CacheKey(reader.segment_id(), field);

if let Some(col) = self.cache.get(&key) {
Ok(col.into())
} else {
let column: Option<BytesColumn> = reader.fast_fields().bytes(field)?;

if let Some(col) = column {
self.cache.insert(key.into(), col.clone().into());

Ok(Some(col))
} else {
Ok(None)
}
}
}

pub fn get_str_column(
&self,
reader: &SegmentReader,
field: &str,
) -> tantivy::Result<Option<StrColumn>> {
let key = CacheKey(reader.segment_id(), field);

if let Some(col) = self.cache.get(&key) {
Ok(col.into())
} else {
let column: Option<StrColumn> = reader.fast_fields().str(field)?;

if let Some(col) = column {
self.cache.insert(key.into(), col.clone().into());

Ok(Some(col))
} else {
Ok(None)
}
}
}
}

#[cfg(test)]
mod tests {
use crate::field_constants::{PART_ID, PART_KEY};
use std::hash::{DefaultHasher, Hash, Hasher};

use crate::test_utils::{build_test_schema, COL1_NAME};

use super::*;

#[test]
fn test_cache_key_equivalence() {
let index = build_test_schema();
let reader = index.searcher.segment_readers().first().unwrap();

let key = CacheKey(reader.segment_id(), "foo");
let owned_key: (SegmentId, String) = key.clone().into();

assert_eq!(key.0, owned_key.0);
assert_eq!(key.1, owned_key.1);

let mut hasher = DefaultHasher::new();
key.hash(&mut hasher);
let key_hash = hasher.finish();

let mut hasher = DefaultHasher::new();
owned_key.hash(&mut hasher);
let owned_key_hash = hasher.finish();

assert_eq!(key_hash, owned_key_hash);
}

#[test]
fn test_cache_miss() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _: Column<i64> = cache
.get_column(reader, PART_ID)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 0);
}

#[test]
fn test_cache_hit() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _: Column<i64> = cache
.get_column(reader, PART_ID)
.expect("Should succeed")
.expect("Should return one item");

let _: Column<i64> = cache
.get_column(reader, PART_ID)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 1);
}

#[test]
fn test_str_cache_miss() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _ = cache
.get_str_column(reader, COL1_NAME)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 0);
}

#[test]
fn test_str_cache_hit() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _ = cache
.get_str_column(reader, COL1_NAME)
.expect("Should succeed")
.expect("Should return one item");

let _ = cache
.get_str_column(reader, COL1_NAME)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 1);
}

#[test]
fn test_bytes_cache_miss() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _ = cache
.get_bytes_column(reader, PART_KEY)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 0);
}

#[test]
fn test_bytes_cache_hit() {
let index = build_test_schema();
let cache = ColumnCache::default();
let reader = index.searcher.segment_readers().first().unwrap();

let _ = cache
.get_bytes_column(reader, PART_KEY)
.expect("Should succeed")
.expect("Should return one item");

let _ = cache
.get_bytes_column(reader, PART_KEY)
.expect("Should succeed")
.expect("Should return one item");

assert_eq!(cache.cache.misses(), 1);
assert_eq!(cache.cache.hits(), 1);
}
}
97 changes: 97 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/limited_collector.rs
@@ -0,0 +1,97 @@
//! Collector that can abort early for return limits, for example
use tantivy::{
collector::{Collector, SegmentCollector},
query::Weight,
DocId, Score, SegmentReader, TantivyError, TERMINATED,
};

mod limit_counter;
mod unlimited_collector;

pub use limit_counter::{LimitCounter, LimitCounterOptionExt};
pub use unlimited_collector::{UnlimitedCollector, UnlimitedSegmentCollector};

/// Marker struct for exceeding limits as an error
pub struct LimitExceeded;

pub type LimitResult = Result<(), LimitExceeded>;

/// Segment collector that can use a limiter
pub trait LimitedSegmentCollector: SegmentCollector {
fn collect_with_limiter(
&mut self,
doc: DocId,
score: Score,
limiter: Option<&mut LimitCounter>,
) -> LimitResult;
}

/// A collector that can use a limiter to abort early
/// This is modelled off the Lucene behavior where you can
/// throw an exception to stop the collection.
///
/// Since Rust has no exceptions, we need an error path which
/// requires an extension trait.
pub trait LimitedCollector
where
Self: Collector,
Self::Child: LimitedSegmentCollector,
{
/// Returns configured limit
fn limit(&self) -> usize;

fn collect_segment_with_limiter(
&self,
weight: &dyn Weight,
segment_ord: u32,
reader: &SegmentReader,
limiter: &mut LimitCounter,
) -> Result<<Self::Child as SegmentCollector>::Fruit, TantivyError> {
let mut segment_collector = self.for_segment(segment_ord, reader)?;
let mut scorer = weight.scorer(reader, 1.0)?;

// This is an extension of the logic that the base Collector trait provides:
// For each document the scorer produces:
// * Check if it is alive if we have an alive_bitset
// * Collect it with the limiter method
// * If the collect method returns an error that signals we're at the limit, abort
//
// This code does not handle scoring, in part because there's no usage of scoring in FiloDB.
match (reader.alive_bitset(), self.requires_scoring()) {
(Some(alive_bitset), false) => {
let mut doc = scorer.doc();
while doc != TERMINATED {
if alive_bitset.is_alive(doc)
&& segment_collector
.collect_with_limiter(doc, scorer.score(), Some(limiter))
.is_err()
{
// Hit limit
break;
}
doc = scorer.advance();
}
}
(None, false) => {
let mut doc = scorer.doc();
while doc != TERMINATED {
if segment_collector
.collect_with_limiter(doc, scorer.score(), Some(limiter))
.is_err()
{
break;
}
doc = scorer.advance();
}
}
(_, true) => {
return Err(TantivyError::InvalidArgument(
"Scoring not supported".into(),
));
}
}

Ok(segment_collector.harvest())
}
}
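The `collect_segment_with_limiter` loop above keeps pulling documents from the scorer until it terminates or the per-document collect call reports that the limit was hit. A std-only sketch of that early-abort pattern (all names here are hypothetical simplifications, not the tantivy API):

```rust
// Marker error type, mirroring the LimitExceeded signal used above.
struct LimitExceeded;

// Visit each doc until the callback signals the limit was reached;
// returns how many docs were actually collected before aborting.
fn collect_all(
    docs: &[u32],
    mut visit: impl FnMut(u32) -> Result<(), LimitExceeded>,
) -> usize {
    let mut collected = 0;
    for &doc in docs {
        if visit(doc).is_err() {
            // Hit limit -- abort, the Rust stand-in for Lucene's exception
            break;
        }
        collected += 1;
    }
    collected
}

fn main() {
    let docs = [1, 2, 3, 4, 5];
    let limit = 3;
    let mut seen = 0;
    let n = collect_all(&docs, |_doc| {
        seen += 1;
        if seen >= limit {
            Err(LimitExceeded)
        } else {
            Ok(())
        }
    });
    // The third visit returns Err, so only two docs complete collection.
    assert_eq!(n, 2);
}
```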
@@ -0,0 +1,66 @@
//! Counter for enforcing result limits
use super::{LimitExceeded, LimitResult};

/// Counter to keep track of and enforce a limit
pub struct LimitCounter {
limit: usize,
count: usize,
}

impl LimitCounter {
pub fn new(limit: usize) -> Self {
Self { limit, count: 0 }
}

/// Increment the count of seen items, failing on the increment that reaches the limit
#[inline]
pub fn increment(&mut self) -> LimitResult {
self.count += 1;

if self.count >= self.limit {
Err(LimitExceeded)
} else {
Ok(())
}
}

pub fn at_limit(&self) -> bool {
self.count >= self.limit
}
}

pub trait LimitCounterOptionExt {
fn increment(&mut self) -> LimitResult;
}

impl LimitCounterOptionExt for Option<&mut LimitCounter> {
#[inline]
fn increment(&mut self) -> LimitResult {
match self {
Some(limiter) => limiter.increment(),
None => Ok(()),
}
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_limit_counter() {
let mut counter = LimitCounter::new(2);

// Count 0
assert!(!counter.at_limit());

// Count 1
assert!(counter.increment().is_ok());
assert!(!counter.at_limit());

// Count 2
assert!(counter.increment().is_err());
assert!(counter.at_limit());
}
}
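The `LimitCounterOptionExt` trait above lets call sites pass `Option<&mut LimitCounter>` and treat `None` as "no limit". A self-contained std-only sketch of the same extension-trait pattern (simplified names, not the crate's actual types):

```rust
struct Exceeded;

struct Counter {
    limit: usize,
    count: usize,
}

impl Counter {
    fn increment(&mut self) -> Result<(), Exceeded> {
        self.count += 1;
        if self.count >= self.limit {
            Err(Exceeded)
        } else {
            Ok(())
        }
    }
}

// Extension trait so Option<&mut Counter> gets an increment() too.
trait OptionCounterExt {
    fn increment(&mut self) -> Result<(), Exceeded>;
}

impl OptionCounterExt for Option<&mut Counter> {
    fn increment(&mut self) -> Result<(), Exceeded> {
        match self {
            Some(counter) => counter.increment(),
            None => Ok(()), // unlimited path: never fails
        }
    }
}

fn main() {
    let mut no_limiter: Option<&mut Counter> = None;
    assert!(no_limiter.increment().is_ok()); // no-op when no limiter installed

    let mut counter = Counter { limit: 2, count: 0 };
    let mut limited = Some(&mut counter);
    assert!(limited.increment().is_ok()); // count 1
    assert!(limited.increment().is_err()); // count 2 == limit
}
```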
@@ -0,0 +1,113 @@
//! Helper to wrap collectors as unlimited
//! This is needed for collectors outside this crate, such as the
//! built-in Tantivy ones
use tantivy::{
collector::{Collector, SegmentCollector},
DocId, Score, SegmentReader,
};

use super::{LimitCounter, LimitResult, LimitedCollector, LimitedSegmentCollector};

/// Wrapper to adapt not limited collectors into the limiting framework
pub struct UnlimitedCollector<T>
where
T: Collector,
{
inner: T,
}

impl<T> UnlimitedCollector<T>
where
T: Collector,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}

impl<T> LimitedCollector for UnlimitedCollector<T>
where
T: Collector,
{
fn limit(&self) -> usize {
usize::MAX
}
}

impl<T> Collector for UnlimitedCollector<T>
where
T: Collector,
{
type Fruit = T::Fruit;

type Child = UnlimitedSegmentCollector<T::Child>;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
segment: &SegmentReader,
) -> tantivy::Result<Self::Child> {
let segment_collector = self.inner.for_segment(segment_local_id, segment)?;

Ok(UnlimitedSegmentCollector::new(segment_collector))
}

fn requires_scoring(&self) -> bool {
self.inner.requires_scoring()
}

fn merge_fruits(
&self,
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> tantivy::Result<Self::Fruit> {
self.inner.merge_fruits(segment_fruits)
}
}

pub struct UnlimitedSegmentCollector<T>
where
T: SegmentCollector,
{
inner: T,
}

impl<T> UnlimitedSegmentCollector<T>
where
T: SegmentCollector,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}

impl<T> LimitedSegmentCollector for UnlimitedSegmentCollector<T>
where
T: SegmentCollector,
{
fn collect_with_limiter(
&mut self,
doc: DocId,
score: Score,
_limiter: Option<&mut LimitCounter>,
) -> LimitResult {
self.inner.collect(doc, score);

Ok(())
}
}

impl<T> SegmentCollector for UnlimitedSegmentCollector<T>
where
T: SegmentCollector,
{
type Fruit = T::Fruit;

fn collect(&mut self, doc: DocId, score: Score) {
self.inner.collect(doc, score)
}

fn harvest(self) -> Self::Fruit {
self.inner.harvest()
}
}
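`UnlimitedCollector` above is a newtype adapter: it wraps any inner collector, reports an effectively infinite limit, and delegates everything else. A std-only sketch of that adapter shape (hypothetical names, not the crate's traits):

```rust
// Minimal stand-in for the limit-reporting trait.
trait Limited {
    fn limit(&self) -> usize;
}

// Some inner collector that knows nothing about limits.
struct CountingCollector {
    seen: usize,
}

// Newtype wrapper that adds the Limited capability by delegation.
struct Unlimited<T>(T);

impl<T> Limited for Unlimited<T> {
    fn limit(&self) -> usize {
        usize::MAX // never aborts early
    }
}

fn main() {
    let wrapped = Unlimited(CountingCollector { seen: 0 });
    assert_eq!(wrapped.limit(), usize::MAX);
    assert_eq!(wrapped.0.seen, 0); // inner collector remains accessible
}
```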
153 changes: 153 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/part_id_collector.rs
@@ -0,0 +1,153 @@
//! Collector to pull part IDs from a document
use std::cmp::min;

use crate::field_constants;
use tantivy::{
collector::{Collector, SegmentCollector},
columnar::Column,
TantivyError,
};

use crate::collectors::column_cache::ColumnCache;

use super::limited_collector::{
LimitCounterOptionExt, LimitResult, LimitedCollector, LimitedSegmentCollector,
};

pub struct PartIdCollector {
limit: usize,
column_cache: ColumnCache,
}

impl PartIdCollector {
pub fn new(limit: usize, column_cache: ColumnCache) -> Self {
Self {
limit,
column_cache,
}
}
}

impl LimitedCollector for PartIdCollector {
fn limit(&self) -> usize {
self.limit
}
}

impl Collector for PartIdCollector {
type Fruit = Vec<i32>;

type Child = PartIdSegmentCollector;

fn for_segment(
&self,
_segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<PartIdSegmentCollector> {
let column: Column<i64> = self
.column_cache
.get_column(segment, field_constants::PART_ID)?
.ok_or_else(|| TantivyError::FieldNotFound(field_constants::PART_ID.to_string()))?;

Ok(PartIdSegmentCollector {
column,
docs: Vec::new(),
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(&self, segment_fruits: Vec<Vec<i32>>) -> tantivy::Result<Vec<i32>> {
let len: usize = min(segment_fruits.iter().map(|x| x.len()).sum(), self.limit);

let mut result = Vec::with_capacity(len);
for part_ids in segment_fruits {
result.extend(part_ids.iter().take(self.limit - result.len()));
}

Ok(result)
}
}
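The `merge_fruits` implementation above appends each segment's IDs only up to the overall limit. A std-only sketch of that capped merge (hypothetical function name):

```rust
// Append each segment's IDs until the overall limit is reached.
fn merge_with_limit(segments: Vec<Vec<i32>>, limit: usize) -> Vec<i32> {
    let mut result = Vec::new();
    for ids in segments {
        if result.len() >= limit {
            break;
        }
        result.extend(ids.into_iter().take(limit - result.len()));
    }
    result
}

fn main() {
    let merged = merge_with_limit(vec![vec![1, 2], vec![10]], 2);
    // Limit is reached before the second segment contributes anything.
    assert_eq!(merged, vec![1, 2]);
}
```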

pub struct PartIdSegmentCollector {
column: Column<i64>,
docs: Vec<i32>,
}

impl LimitedSegmentCollector for PartIdSegmentCollector {
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
_score: tantivy::Score,
mut limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
if let Some(val) = self.column.first(doc) {
self.docs.push(val as i32);
limiter.increment()?;
}

Ok(())
}
}

impl SegmentCollector for PartIdSegmentCollector {
type Fruit = Vec<i32>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let _ = self.collect_with_limiter(doc, score, None);
}

fn harvest(self) -> Self::Fruit {
self.docs
}
}

#[cfg(test)]
mod tests {
use hashbrown::HashSet;
use tantivy::query::AllQuery;

use crate::test_utils::build_test_schema;

use super::*;

#[test]
fn test_part_id_collector() {
let index = build_test_schema();
let cache = ColumnCache::default();

let collector = PartIdCollector::new(usize::MAX, cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Two docs, IDs 1 and 10
assert_eq!(
results.into_iter().collect::<HashSet<i32>>(),
[1, 10].into_iter().collect::<HashSet<i32>>()
);
}

#[test]
fn test_part_id_collector_with_limit() {
let index = build_test_schema();
let cache = ColumnCache::default();

let collector = PartIdCollector::new(1, cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Which doc matches first is non-deterministic, just check length
assert_eq!(results.len(), 1);
}
}
162 changes: 162 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/part_key_collector.rs
@@ -0,0 +1,162 @@
//! Collector for part key binary data
use tantivy::{
collector::{Collector, SegmentCollector},
schema::OwnedValue,
DocAddress, Searcher, TantivyDocument, TantivyError,
};

use crate::field_constants::PART_KEY;

use super::limited_collector::{LimitResult, LimitedCollector, LimitedSegmentCollector};

#[derive(Default)]
pub struct PartKeyCollector;

pub struct UnresolvedPartKey(DocAddress);

impl UnresolvedPartKey {
pub fn resolve(self, searcher: &Searcher) -> Result<Vec<u8>, TantivyError> {
let doc_data = searcher.doc::<TantivyDocument>(self.0)?;
let part_key_field = searcher.schema().get_field(PART_KEY)?;

let Some(OwnedValue::Bytes(part_key)) = doc_data
.into_iter()
.filter(|x| x.field == part_key_field)
.map(|x| x.value)
.next()
else {
return Err(TantivyError::FieldNotFound(PART_KEY.to_string()));
};

Ok(part_key)
}
}

impl PartKeyCollector {
pub fn new() -> Self {
Self {}
}
}

impl LimitedCollector for PartKeyCollector {
fn limit(&self) -> usize {
usize::MAX
}
}

impl Collector for PartKeyCollector {
type Fruit = Option<UnresolvedPartKey>;

type Child = PartKeySegmentCollector;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
_segment: &tantivy::SegmentReader,
) -> tantivy::Result<PartKeySegmentCollector> {
Ok(PartKeySegmentCollector {
segment_local_id,
doc: None,
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(
&self,
segment_fruits: Vec<Option<UnresolvedPartKey>>,
) -> tantivy::Result<Option<UnresolvedPartKey>> {
Ok(segment_fruits.into_iter().flatten().next())
}
}

pub struct PartKeySegmentCollector {
segment_local_id: u32,
doc: Option<UnresolvedPartKey>,
}

impl LimitedSegmentCollector for PartKeySegmentCollector {
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
score: tantivy::Score,
_limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
self.collect(doc, score);

Ok(())
}
}

impl SegmentCollector for PartKeySegmentCollector {
type Fruit = Option<UnresolvedPartKey>;

fn collect(&mut self, doc: tantivy::DocId, _score: tantivy::Score) {
if self.doc.is_some() {
return;
}

self.doc = Some(UnresolvedPartKey(DocAddress::new(
self.segment_local_id,
doc,
)));
}

fn harvest(self) -> Self::Fruit {
self.doc
}
}

#[cfg(test)]
mod tests {
use tantivy::{
query::{EmptyQuery, TermQuery},
schema::IndexRecordOption,
Term,
};

use crate::test_utils::{build_test_schema, COL1_NAME};

use super::*;

#[test]
fn test_part_key_collector() {
let index = build_test_schema();

let collector = PartKeyCollector::new();
let query = TermQuery::new(
Term::from_field_text(index.schema.get_field(COL1_NAME).unwrap(), "ABC"),
IndexRecordOption::Basic,
);

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

assert_eq!(
results.map(|r| r.resolve(&index.searcher).unwrap()),
Some(vec![0x41, 0x41])
);
}

#[test]
fn test_part_key_collector_no_match() {
let index = build_test_schema();

let collector = PartKeyCollector::new();

let query = EmptyQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// No documents match the empty query, so no part key is returned
assert_eq!(results.map(|r| r.resolve(&index.searcher).unwrap()), None);
}
}
@@ -0,0 +1,271 @@
//! Collector for part key records (binary data plus start and end times)
use std::cmp::min;

use crate::collectors::column_cache::ColumnCache;
use crate::field_constants::{END_TIME, PART_KEY, START_TIME};
use tantivy::schema::OwnedValue;
use tantivy::{
collector::{Collector, SegmentCollector},
columnar::Column,
TantivyError,
};
use tantivy::{DocAddress, Searcher, TantivyDocument};

use super::limited_collector::{
LimitCounterOptionExt, LimitResult, LimitedCollector, LimitedSegmentCollector,
};

/// Unresolved records returned from queries; part key bytes are fetched later via resolve()
#[derive(Debug, PartialEq, Hash, PartialOrd, Eq)]
pub struct UnresolvedPartKeyRecord {
pub doc_id: DocAddress,
pub start_time: i64,
pub end_time: i64,
}

impl UnresolvedPartKeyRecord {
pub fn resolve(self, searcher: &Searcher) -> Result<PartKeyRecord, TantivyError> {
let doc_data = searcher.doc::<TantivyDocument>(self.doc_id)?;
let part_key_field = searcher.schema().get_field(PART_KEY)?;

let Some(OwnedValue::Bytes(part_key)) = doc_data
.into_iter()
.filter(|x| x.field == part_key_field)
.map(|x| x.value)
.next()
else {
return Err(TantivyError::FieldNotFound(PART_KEY.to_string()));
};

Ok(PartKeyRecord {
part_key,
start_time: self.start_time,
end_time: self.end_time,
})
}
}

/// Fully resolved records returned from queries
#[derive(Debug, PartialEq, Hash, PartialOrd, Eq)]
pub struct PartKeyRecord {
pub part_key: Vec<u8>,
pub start_time: i64,
pub end_time: i64,
}

impl PartKeyRecord {
pub fn serialized_len(&self) -> usize {
// Two i64 ints, 1 u32 int, byte array
self.part_key.len() + size_of::<u32>() + (2 * size_of::<i64>())
}

pub fn serialize(self, vec: &mut impl Extend<u8>) {
// Serialize as start time, end time, part_key len, part_key
vec.extend(self.start_time.to_le_bytes());
vec.extend(self.end_time.to_le_bytes());

let len = self.part_key.len() as u32;
vec.extend(len.to_le_bytes());
vec.extend(self.part_key);
}
}
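The `serialize` method above writes a fixed little-endian wire layout: start time (i64), end time (i64), key length (u32), then the key bytes. A std-only sketch of that layout, paired with a matching deserializer (the deserializer is an illustration only; the change itself ships just the serializer):

```rust
// start_time (i64 LE) | end_time (i64 LE) | key len (u32 LE) | key bytes
fn serialize(start: i64, end: i64, key: &[u8], out: &mut Vec<u8>) {
    out.extend(start.to_le_bytes());
    out.extend(end.to_le_bytes());
    out.extend((key.len() as u32).to_le_bytes());
    out.extend(key);
}

// Hypothetical inverse, reading the same layout back.
fn deserialize(buf: &[u8]) -> (i64, i64, Vec<u8>) {
    let start = i64::from_le_bytes(buf[0..8].try_into().unwrap());
    let end = i64::from_le_bytes(buf[8..16].try_into().unwrap());
    let len = u32::from_le_bytes(buf[16..20].try_into().unwrap()) as usize;
    (start, end, buf[20..20 + len].to_vec())
}

fn main() {
    let mut buf = Vec::new();
    serialize(1, 2, &[0xAA, 0xAA], &mut buf);
    assert_eq!(buf.len(), 22); // 8 + 8 + 4 + 2, matching serialized_len()
    assert_eq!(deserialize(&buf), (1, 2, vec![0xAA, 0xAA]));
}
```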

pub struct PartKeyRecordCollector {
limit: usize,
column_cache: ColumnCache,
}

impl PartKeyRecordCollector {
pub fn new(limit: usize, column_cache: ColumnCache) -> Self {
Self {
limit,
column_cache,
}
}
}

impl LimitedCollector for PartKeyRecordCollector {
fn limit(&self) -> usize {
self.limit
}
}

impl Collector for PartKeyRecordCollector {
type Fruit = Vec<UnresolvedPartKeyRecord>;

type Child = PartKeyRecordSegmentCollector;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<PartKeyRecordSegmentCollector> {
let start_time_column: Column<i64> = self
.column_cache
.get_column(segment, START_TIME)?
.ok_or_else(|| TantivyError::FieldNotFound(START_TIME.to_string()))?;

let end_time_column: Column<i64> = self
.column_cache
.get_column(segment, END_TIME)?
.ok_or_else(|| TantivyError::FieldNotFound(END_TIME.to_string()))?;

Ok(PartKeyRecordSegmentCollector {
segment_ord: segment_local_id,
start_time_column,
end_time_column,
docs: Vec::new(),
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(
&self,
segment_fruits: Vec<Vec<UnresolvedPartKeyRecord>>,
) -> tantivy::Result<Vec<UnresolvedPartKeyRecord>> {
let len: usize = min(segment_fruits.iter().map(|x| x.len()).sum(), self.limit);

let mut result = Vec::with_capacity(len);
for part_ids in segment_fruits {
result.extend(part_ids.into_iter().take(self.limit - result.len()));
}

Ok(result)
}
}

pub struct PartKeyRecordSegmentCollector {
segment_ord: u32,
start_time_column: Column<i64>,
end_time_column: Column<i64>,
docs: Vec<UnresolvedPartKeyRecord>,
}

impl LimitedSegmentCollector for PartKeyRecordSegmentCollector {
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
_score: tantivy::Score,
mut limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
let doc_id = DocAddress::new(self.segment_ord, doc);

let Some(start_time) = self.start_time_column.first(doc) else {
return Ok(());
};

let Some(end_time) = self.end_time_column.first(doc) else {
return Ok(());
};

self.docs.push(UnresolvedPartKeyRecord {
doc_id,
start_time,
end_time,
});

limiter.increment()?;

Ok(())
}
}

impl SegmentCollector for PartKeyRecordSegmentCollector {
type Fruit = Vec<UnresolvedPartKeyRecord>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let _ = self.collect_with_limiter(doc, score, None);
}

fn harvest(self) -> Self::Fruit {
self.docs
}
}

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use tantivy::query::AllQuery;

use crate::test_utils::build_test_schema;

use super::*;

#[test]
fn test_part_key_collector() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = PartKeyRecordCollector::new(usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Two docs, IDs 1 and 10
assert_eq!(
results
.into_iter()
.map(|x| x.resolve(&index.searcher).unwrap())
.collect::<HashSet<_>>(),
[
PartKeyRecord {
part_key: vec![0x41, 0x41],
start_time: 1234,
end_time: 1235
},
PartKeyRecord {
part_key: vec![0x42, 0x42],
start_time: 4321,
end_time: 10000
}
]
.into_iter()
.collect::<HashSet<_>>()
);
}

#[test]
fn test_part_key_collector_with_limit() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = PartKeyRecordCollector::new(1, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Which doc matches first is non-deterministic, just check length
assert_eq!(results.len(), 1);
}

#[test]
fn test_part_key_record_serialize() {
let record = PartKeyRecord {
part_key: vec![0xAAu8; 2],
start_time: 1,
end_time: 2,
};

// 8 bytes, 8 bytes, 4 bytes, 2 bytes
assert_eq!(22, record.serialized_len());

let mut vec = vec![];
record.serialize(&mut vec);

assert_eq!(
vec![1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 170, 170],
vec
);
}
}
239 changes: 239 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/string_field_collector.rs
@@ -0,0 +1,239 @@
//! Collector to pull string values from a document
use std::collections::hash_map::Entry;

use hashbrown::HashMap;
use nohash_hasher::IntMap;
use tantivy::{
collector::{Collector, SegmentCollector},
columnar::StrColumn,
};

use crate::collectors::column_cache::ColumnCache;

use super::limited_collector::{
LimitCounterOptionExt, LimitResult, LimitedCollector, LimitedSegmentCollector,
};

pub struct StringFieldCollector<'a> {
field: &'a str,
limit: usize,
term_limit: usize,
column_cache: ColumnCache,
}

impl<'a> StringFieldCollector<'a> {
pub fn new(field: &'a str, limit: usize, term_limit: usize, column_cache: ColumnCache) -> Self {
Self {
field,
limit,
term_limit,
column_cache,
}
}
}

impl<'a> LimitedCollector for StringFieldCollector<'a> {
fn limit(&self) -> usize {
self.limit
}
}

impl<'a> Collector for StringFieldCollector<'a> {
type Fruit = Vec<(String, u64)>;

type Child = StringFieldSegmentCollector;

fn for_segment(
&self,
_segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<StringFieldSegmentCollector> {
let column = self.column_cache.get_str_column(segment, self.field)?;

Ok(StringFieldSegmentCollector {
column,
docs: IntMap::default(),
term_limit: self.term_limit,
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(
&self,
segment_fruits: Vec<HashMap<String, u64>>,
) -> tantivy::Result<Vec<(String, u64)>> {
let mut results = HashMap::new();

for mut map in segment_fruits.into_iter() {
for (value, count) in map.drain() {
*results.entry(value).or_insert(0) += count;
}
}

let mut results: Vec<_> = results.drain().collect();
results.sort_by(|(_, count_a), (_, count_b)| count_b.cmp(count_a));

let results = results.into_iter().take(self.limit).collect();

Ok(results)
}
}
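`merge_fruits` above sums each segment's per-term counts, sorts by descending count, and truncates to the limit. A std-only sketch of that merge step (hypothetical function name):

```rust
use std::collections::HashMap;

// Sum per-segment (term -> count) maps, then keep the top `limit` terms.
fn merge(segments: Vec<HashMap<String, u64>>, limit: usize) -> Vec<(String, u64)> {
    let mut totals: HashMap<String, u64> = HashMap::new();
    for segment in segments {
        for (term, count) in segment {
            *totals.entry(term).or_insert(0) += count;
        }
    }

    let mut out: Vec<_> = totals.into_iter().collect();
    out.sort_by(|(_, a), (_, b)| b.cmp(a)); // highest count first
    out.truncate(limit);
    out
}

fn main() {
    let s1 = HashMap::from([("abc".to_string(), 2), ("def".to_string(), 1)]);
    let s2 = HashMap::from([("abc".to_string(), 1)]);
    let merged = merge(vec![s1, s2], 1);
    // "abc" totals 3 across segments; "def" is dropped by the limit.
    assert_eq!(merged, vec![("abc".to_string(), 3)]);
}
```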

pub struct StringFieldSegmentCollector {
column: Option<StrColumn>,
docs: IntMap<u64, u64>,
term_limit: usize,
}

impl LimitedSegmentCollector for StringFieldSegmentCollector {
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
_score: tantivy::Score,
mut limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
if let Some(column) = &self.column {
for ord in column.term_ords(doc) {
if self.docs.len() >= self.term_limit {
break;
}

// Defer translating ords to strings until harvest so the
// string data is copied out only once per ord
let entry = self.docs.entry(ord);
let increment = matches!(entry, Entry::Vacant(_));
*entry.or_insert(0) += 1;

if increment {
limiter.increment()?;
}
}
}

Ok(())
}
}

impl SegmentCollector for StringFieldSegmentCollector {
type Fruit = HashMap<String, u64>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let _ = self.collect_with_limiter(doc, score, None);
}

fn harvest(self) -> Self::Fruit {
self.docs
.into_iter()
.map(|(ord, count)| {
if let Some(column) = &self.column {
let mut value = String::new();
let _ = column.ord_to_str(ord, &mut value);

(value, count)
} else {
(String::new(), count)
}
})
.filter(|(k, _v)| !k.is_empty())
.collect()
}
}

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use tantivy::query::AllQuery;

use crate::test_utils::{build_test_schema, COL1_NAME, JSON_COL_NAME};

use super::*;

#[test]
fn test_string_field_collector() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = StringFieldCollector::new(COL1_NAME, usize::MAX, usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Two docs
assert_eq!(
results.into_iter().collect::<HashSet<_>>(),
[("ABC".to_string(), 1), ("DEF".to_string(), 1)]
.into_iter()
.collect::<HashSet<_>>()
);
}

#[test]
fn test_string_field_collector_json() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let col_name = format!("{}.{}", JSON_COL_NAME, "f1");
let collector = StringFieldCollector::new(&col_name, usize::MAX, usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Two docs
assert_eq!(
results.into_iter().collect::<HashSet<_>>(),
[("value".to_string(), 1), ("othervalue".to_string(), 1)]
.into_iter()
.collect::<HashSet<_>>()
);
}

#[test]
fn test_string_field_collector_json_invalid_field() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let col_name = format!("{}.{}", JSON_COL_NAME, "invalid");
let collector = StringFieldCollector::new(&col_name, usize::MAX, usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// No results, no failure
assert_eq!(
results.into_iter().collect::<HashSet<_>>(),
[].into_iter().collect::<HashSet<_>>()
);
}

#[test]
fn test_string_field_collector_with_limit() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = StringFieldCollector::new(COL1_NAME, 1, usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Which doc matches first is non-deterministic, just check length
assert_eq!(results.len(), 1);
}
}
172 changes: 172 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/time_collector.rs
@@ -0,0 +1,172 @@
//! Collector to pull part IDs + time values from a document
use std::cmp::min;

use tantivy::{
collector::{Collector, SegmentCollector},
columnar::Column,
TantivyError,
};

use crate::collectors::column_cache::ColumnCache;
use crate::field_constants;

use super::limited_collector::{
LimitCounterOptionExt, LimitResult, LimitedCollector, LimitedSegmentCollector,
};

pub struct TimeCollector<'a> {
time_field: &'a str,
limit: usize,
column_cache: ColumnCache,
}

impl<'a> TimeCollector<'a> {
pub fn new(time_field: &'a str, limit: usize, column_cache: ColumnCache) -> Self {
Self {
time_field,
limit,
column_cache,
}
}
}

impl<'a> LimitedCollector for TimeCollector<'a> {
fn limit(&self) -> usize {
self.limit
}
}

impl<'a> Collector for TimeCollector<'a> {
// Tuple of part_id, time
type Fruit = Vec<(i32, i64)>;

type Child = TimeSegmentCollector;

fn for_segment(
&self,
_segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<TimeSegmentCollector> {
let id_column: Column<i64> = self
.column_cache
.get_column(segment, field_constants::PART_ID)?
.ok_or_else(|| TantivyError::FieldNotFound(field_constants::PART_ID.to_string()))?;

let time_column: Column<i64> = self
.column_cache
.get_column(segment, self.time_field)?
.ok_or_else(|| TantivyError::FieldNotFound(self.time_field.to_string()))?;

Ok(TimeSegmentCollector {
id_column,
time_column,
docs: Vec::new(),
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(
&self,
segment_fruits: Vec<Vec<(i32, i64)>>,
) -> tantivy::Result<Vec<(i32, i64)>> {
let len: usize = min(segment_fruits.iter().map(|x| x.len()).sum(), self.limit);

let mut result = Vec::with_capacity(len);
for part_ids in segment_fruits {
result.extend(part_ids.iter().take(self.limit - result.len()));
}

Ok(result)
}
}

pub struct TimeSegmentCollector {
id_column: Column<i64>,
time_column: Column<i64>,
docs: Vec<(i32, i64)>,
}

impl LimitedSegmentCollector for TimeSegmentCollector {
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
_score: tantivy::Score,
mut limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
if let Some(id) = self.id_column.first(doc) {
if let Some(time) = self.time_column.first(doc) {
self.docs.push((id as i32, time));
limiter.increment()?;
}
}

Ok(())
}
}

impl SegmentCollector for TimeSegmentCollector {
type Fruit = Vec<(i32, i64)>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let _ = self.collect_with_limiter(doc, score, None);
}

fn harvest(self) -> Self::Fruit {
self.docs
}
}

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use field_constants::START_TIME;
use tantivy::query::AllQuery;

use crate::test_utils::build_test_schema;

use super::*;

#[test]
fn test_time_collector() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = TimeCollector::new(START_TIME, usize::MAX, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Two docs, IDs 1 and 10
assert_eq!(
results.into_iter().collect::<HashSet<(i32, i64)>>(),
[(1, 1234), (10, 4321)]
.into_iter()
.collect::<HashSet<(i32, i64)>>()
);
}

#[test]
fn test_time_collector_with_limit() {
let index = build_test_schema();
let column_cache = ColumnCache::default();

let collector = TimeCollector::new(START_TIME, 1, column_cache);
let query = AllQuery;

let results = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Which doc matches first is non-deterministic, just check length
assert_eq!(results.len(), 1);
}
}
262 changes: 262 additions & 0 deletions core/src/rust/tantivy_utils/src/collectors/time_range_filter.rs
@@ -0,0 +1,262 @@
//! Filter collector that applies a start and end time range
use tantivy::{
collector::{Collector, SegmentCollector},
columnar::Column,
TantivyError,
};

use crate::collectors::column_cache::ColumnCache;
use crate::field_constants;

use super::limited_collector::{LimitResult, LimitedCollector, LimitedSegmentCollector};

/// Filters results on a time range
pub struct TimeRangeFilter<'a, T>
where
T: LimitedCollector,
T::Child: LimitedSegmentCollector,
{
/// Inner collector
collector: &'a T,
/// Start time
start: i64,
/// End time
end: i64,
/// Column cache
column_cache: ColumnCache,
}

impl<'a, T> TimeRangeFilter<'a, T>
where
T: LimitedCollector,
T::Child: LimitedSegmentCollector,
{
pub fn new(collector: &'a T, start: i64, end: i64, column_cache: ColumnCache) -> Self {
Self {
collector,
start,
end,
column_cache,
}
}
}

impl<'a, T> LimitedCollector for TimeRangeFilter<'a, T>
where
T: LimitedCollector,
T::Child: LimitedSegmentCollector,
{
fn limit(&self) -> usize {
self.collector.limit()
}
}

impl<'a, T> Collector for TimeRangeFilter<'a, T>
where
T: LimitedCollector,
T::Child: LimitedSegmentCollector,
{
type Fruit = T::Fruit;

type Child = TimeRangeFilterSegmentCollector<T::Child>;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<Self::Child> {
let start_column = if self.end < i64::MAX {
Some(
self.column_cache
.get_column(segment, field_constants::START_TIME)?
.ok_or_else(|| {
TantivyError::FieldNotFound(field_constants::START_TIME.to_string())
})?,
)
} else {
None
};

let end_column = if self.start > 0 {
Some(
self.column_cache
.get_column(segment, field_constants::END_TIME)?
.ok_or_else(|| {
TantivyError::FieldNotFound(field_constants::END_TIME.to_string())
})?,
)
} else {
None
};

let collector = self.collector.for_segment(segment_local_id, segment)?;

Ok(TimeRangeFilterSegmentCollector::<T::Child> {
start_column,
end_column,
start_time: self.start,
end_time: self.end,
collector,
})
}

fn requires_scoring(&self) -> bool {
false
}

fn merge_fruits(
&self,
segment_fruits: Vec<<Self::Child as tantivy::collector::SegmentCollector>::Fruit>,
) -> tantivy::Result<Self::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}

pub struct TimeRangeFilterSegmentCollector<T>
where
T: LimitedSegmentCollector,
{
collector: T,
start_column: Option<Column<i64>>,
end_column: Option<Column<i64>>,
start_time: i64,
end_time: i64,
}

impl<T> LimitedSegmentCollector for TimeRangeFilterSegmentCollector<T>
where
T: LimitedSegmentCollector,
{
fn collect_with_limiter(
&mut self,
doc: tantivy::DocId,
score: tantivy::Score,
limiter: Option<&mut super::limited_collector::LimitCounter>,
) -> LimitResult {
if let Some(start_column) = &self.start_column {
let doc_start = start_column.first(doc).unwrap_or(0);
if doc_start > self.end_time {
return Ok(());
}
}

if let Some(end_column) = &self.end_column {
let doc_end = end_column.first(doc).unwrap_or(i64::MAX);
if doc_end < self.start_time {
return Ok(());
}
}

self.collector.collect_with_limiter(doc, score, limiter)
}
}

impl<T> SegmentCollector for TimeRangeFilterSegmentCollector<T>
where
T: LimitedSegmentCollector,
{
type Fruit = T::Fruit;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let _ = self.collect_with_limiter(doc, score, None);
}

fn harvest(self) -> Self::Fruit {
self.collector.harvest()
}
}

#[cfg(test)]
mod tests {
use tantivy::{collector::Count, query::AllQuery};

use crate::{collectors::limited_collector::UnlimitedCollector, test_utils::build_test_schema};

use super::*;

#[test]
fn test_time_filter_match_all() {
let index = build_test_schema();
let cache = ColumnCache::default();

let query = AllQuery;
let collector = UnlimitedCollector::new(Count);
let collector = TimeRangeFilter::new(&collector, 0, i64::MAX, cache);
let result = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Should match both docs since there's no effective time filter
assert_eq!(result, 2);
}

#[test]
fn test_time_filter_match_end_filter() {
let index = build_test_schema();
let cache = ColumnCache::default();

let query = AllQuery;
let collector = UnlimitedCollector::new(Count);
let collector = TimeRangeFilter::new(&collector, 0, 2000, cache);
let result = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Should match one doc since the other starts after end
assert_eq!(result, 1);
}

#[test]
fn test_time_filter_match_start_filter() {
let index = build_test_schema();
let cache = ColumnCache::default();

let query = AllQuery;
let collector = UnlimitedCollector::new(Count);
let collector = TimeRangeFilter::new(&collector, 2000, i64::MAX, cache);
let result = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Should match one doc since the other ends before the start time
assert_eq!(result, 1);
}

#[test]
fn test_time_filter_match_overlap() {
let index = build_test_schema();
let cache = ColumnCache::default();

let query = AllQuery;
let collector = UnlimitedCollector::new(Count);
let collector = TimeRangeFilter::new(&collector, 1000, 2000, cache);
let result = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Should match one doc since the other starts after the range ends
assert_eq!(result, 1);
}

#[test]
fn test_time_filter_match_outside_range() {
let index = build_test_schema();
let cache = ColumnCache::default();

let query = AllQuery;
let collector = UnlimitedCollector::new(Count);
let collector = TimeRangeFilter::new(&collector, 20_000, 20_000, cache);
let result = index
.searcher
.search(&query, &collector)
.expect("Should succeed");

// Should match no docs - out of range
assert_eq!(result, 0);
}
}
17 changes: 17 additions & 0 deletions core/src/rust/tantivy_utils/src/field_constants.rs
@@ -0,0 +1,17 @@
//! Field names
pub fn facet_field_name(name: &str) -> String {
format!("{}{}", FACET_FIELD_PREFIX, name)
}

// These should be kept in sync with the constants in PartKeyIndex.scala
// as they're fields that can be directly queried via incoming filters
// or fields that are filtered out of label lists
pub const DOCUMENT_ID: &str = "__partIdField__";
pub const PART_ID: &str = "__partIdDv__";
pub const PART_KEY: &str = "__partKey__";
pub const LABEL_LIST: &str = "__labelList__";
pub const FACET_FIELD_PREFIX: &str = "$facet_";
pub const START_TIME: &str = "__startTime__";
pub const END_TIME: &str = "__endTime__";
pub const TYPE: &str = "_type_";
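The `facet_field_name` helper above just prepends the facet prefix to a label name. A tiny standalone sketch of that convention (constants re-declared here so the snippet compiles on its own):

```rust
const FACET_FIELD_PREFIX: &str = "$facet_";

fn facet_field_name(name: &str) -> String {
    format!("{}{}", FACET_FIELD_PREFIX, name)
}

fn main() {
    // A label "namespace" is stored under the facet field "$facet_namespace".
    assert_eq!(facet_field_name("namespace"), "$facet_namespace");
}
```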
8 changes: 8 additions & 0 deletions core/src/rust/tantivy_utils/src/lib.rs
@@ -0,0 +1,8 @@
//! Common utilities for tantivy operations
#![deny(clippy::expect_used, clippy::unwrap_used, clippy::panic)]

pub mod collectors;
pub mod field_constants;
pub mod query;

pub mod test_utils;
9 changes: 9 additions & 0 deletions core/src/rust/tantivy_utils/src/query.rs
@@ -0,0 +1,9 @@
//! Helpers for queries
pub mod bitset_weight;
pub mod cache;
pub mod prefix_query;
pub mod range_aware_regex;
pub mod shared_doc_set;

pub const JSON_PREFIX_SEPARATOR: &str = "\0s";
90 changes: 90 additions & 0 deletions core/src/rust/tantivy_utils/src/query/bitset_weight.rs
@@ -0,0 +1,90 @@
//! Weight adapter for a cached bitset
use std::sync::Arc;

use tantivy::{
query::{ConstScorer, Explanation, Scorer, Weight},
DocId, Score, SegmentReader, TantivyError,
};
use tantivy_common::BitSet;

use super::shared_doc_set::SharedDocSet;

/// Weight that can play back a cached doc set
pub struct BitSetWeight {
bitset: Arc<BitSet>,
}

impl BitSetWeight {
pub fn new(bitset: Arc<BitSet>) -> Self {
BitSetWeight { bitset }
}
}

impl Weight for BitSetWeight {
fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> tantivy::Result<Box<dyn Scorer>> {
let docs = SharedDocSet::new(self.bitset.clone());
Ok(Box::new(ConstScorer::new(docs, 1.0)))
}

fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) == doc {
Ok(Explanation::new("BitSetWeight", 1.0))
} else {
Err(TantivyError::InvalidArgument(
"Document does not exist".to_string(),
))
}
}
}

#[cfg(test)]
mod tests {
use tantivy::TERMINATED;

use crate::test_utils::build_test_schema;

use super::*;

#[test]
fn test_bitset_weight() {
let index = build_test_schema();

let mut bitset = BitSet::with_max_value(100);
bitset.insert(1);
bitset.insert(10);
bitset.insert(100);

let weight = BitSetWeight::new(bitset.into());
let reader = index.searcher.segment_readers().first().unwrap();

let mut scorer = weight.scorer(reader, 1.0).expect("Should succeed");

assert_eq!(scorer.doc(), 1);
scorer.advance();
assert_eq!(scorer.doc(), 10);
scorer.advance();
assert_eq!(scorer.doc(), 100);
scorer.advance();
assert_eq!(scorer.doc(), TERMINATED);
}

#[test]
fn test_bitset_explain() {
let index = build_test_schema();

let mut bitset = BitSet::with_max_value(100);
bitset.insert(1);

let weight = BitSetWeight::new(bitset.into());
let reader = index.searcher.segment_readers().first().unwrap();

let explanation = weight.explain(reader, 1).expect("Should succeed");

assert_eq!(
format!("{:?}", explanation),
"Explanation({\n \"value\": 1.0,\n \"description\": \"BitSetWeight\"\n})"
);
}
}
257 changes: 257 additions & 0 deletions core/src/rust/tantivy_utils/src/query/cache.rs
@@ -0,0 +1,257 @@
//! Cached query support
use std::{hash::Hash, sync::Arc};

use quick_cache::{sync::Cache, Equivalent, Weighter};
use tantivy::{
collector::SegmentCollector,
query::{EnableScoring, Query, Weight},
schema::{Field, Schema},
Searcher, SegmentId, TantivyError,
};
use tantivy_common::BitSet;

use crate::collectors::limited_collector::{
LimitCounter, LimitedCollector, LimitedSegmentCollector,
};

use super::bitset_weight::BitSetWeight;

/// Cache for query results
///
/// The cache key is a tuple of segment ID and query. The specific format
/// of the query part is left to the caller as it may be a serialized format.
///
/// The cached value is a bitset of the documents that match the query for a
/// given segment. The bitset size in bits equals the number of documents in
/// the segment. We keep the BitSet in an Arc to avoid data copies since, once
/// created, the bitset is immutable.
pub struct QueryCache<QueryType, WeighterType>
where
QueryType: CachableQuery,
WeighterType: Weighter<(SegmentId, QueryType), Arc<BitSet>> + Default + Clone,
{
// Cache of query -> docs
cache: Cache<(SegmentId, QueryType), Arc<BitSet>, WeighterType>,
}

/// Trait for cachable query keys
pub trait CachableQuery: Eq + PartialEq + Hash + Clone {
/// Should this query be cached?
fn should_cache(&self) -> bool;

fn to_query(
&self,
schema: &Schema,
default_field: Option<Field>,
) -> Result<Box<dyn Query>, TantivyError>;
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct CachableQueryKey<'a, QueryType>(pub SegmentId, pub &'a QueryType)
where
QueryType: Clone + PartialEq + Eq;

impl<'a, QueryType> From<CachableQueryKey<'a, QueryType>> for (SegmentId, QueryType)
where
QueryType: Clone + PartialEq + Eq,
{
fn from(value: CachableQueryKey<'a, QueryType>) -> Self {
(value.0, value.1.clone())
}
}

impl<'a, QueryType> Equivalent<(SegmentId, QueryType)> for CachableQueryKey<'a, QueryType>
where
QueryType: Clone + PartialEq + Eq,
{
fn equivalent(&self, key: &(SegmentId, QueryType)) -> bool {
self.0 == key.0 && *self.1 == key.1
}
}

// Tuning parameters for query cache
const DEFAULT_QUERY_CACHE_MAX_SIZE_BYTES: u64 = 50_000_000;
// Rough estimate of per-item bitset size: 31,250 bytes = 250k bits, i.e. one ~250k-doc segment
const DEFAULT_QUERY_CACHE_AVG_ITEM_SIZE: u64 = 31250;

impl<QueryType, WeighterType> QueryCache<QueryType, WeighterType>
where
QueryType: CachableQuery,
WeighterType: Weighter<(SegmentId, QueryType), Arc<BitSet>> + Default + Clone,
{
pub fn new(estimated_items_count: u64, weight_capacity: u64) -> Self {
Self {
cache: Cache::with_weighter(
estimated_items_count as usize,
weight_capacity,
WeighterType::default(),
),
}
}

pub fn query_cache_stats(&self) -> (u64, u64) {
(self.cache.hits(), self.cache.misses())
}

/// Gets the current cache size, in bytes
pub fn size(&self) -> u64 {
self.cache.weight()
}

/// Execute a cachable query
pub fn search<C>(
&self,
searcher: &Searcher,
schema: &Schema,
default_field: Option<Field>,
cachable_query: QueryType,
collector: C,
) -> Result<C::Fruit, TantivyError>
where
C: LimitedCollector,
C::Child: LimitedSegmentCollector,
{
let scoring = EnableScoring::disabled_from_searcher(searcher);

let mut query_weight: Option<Box<dyn Weight>> = None;

let segment_readers = searcher.segment_readers();
let mut fruits: Vec<<C::Child as SegmentCollector>::Fruit> =
Vec::with_capacity(segment_readers.len());

let mut limiter = LimitCounter::new(collector.limit());

// Note - the query optimizations here only work for the single threaded querying. That matches
// the pattern FiloDB uses because it will dispatch multiple queries at a time on different threads,
// so this results in net improvement anyway. If we need to change to the multithreaded executor
// in the future then the lazy query evaluation code will need some work
for (segment_ord, segment_reader) in segment_readers.iter().enumerate() {
// Is it cached
let cache_key = CachableQueryKey(segment_reader.segment_id(), &cachable_query);

let docs = if let Some(docs) = self.cache.get(&cache_key) {
// Cache hit
docs
} else {
// Build query if needed. We do this lazily as it may be expensive to parse a regex, for example.
// This can give a 2-4x speedup in some cases.
let weight = if let Some(weight) = &query_weight {
weight
} else {
let query = cachable_query.to_query(schema, default_field)?;
let weight = query.weight(scoring)?;

query_weight = Some(weight);

// Unwrap is safe here because we just set the value
#[allow(clippy::unwrap_used)]
query_weight.as_ref().unwrap()
};

// Load bit set
let mut bitset = BitSet::with_max_value(segment_reader.max_doc());

weight.for_each_no_score(segment_reader, &mut |docs| {
for doc in docs.iter().cloned() {
bitset.insert(doc);
}
})?;

let bitset = Arc::new(bitset);

if cachable_query.should_cache() {
self.cache.insert(cache_key.into(), bitset.clone());
}

bitset
};

let weight = BitSetWeight::new(docs);
let results = collector.collect_segment_with_limiter(
&weight,
segment_ord as u32,
segment_reader,
&mut limiter,
)?;

fruits.push(results);

if limiter.at_limit() {
break;
}
}

collector.merge_fruits(fruits)
}
}

impl<QueryType, WeighterType> Default for QueryCache<QueryType, WeighterType>
where
QueryType: CachableQuery,
WeighterType: Weighter<(SegmentId, QueryType), Arc<BitSet>> + Default + Clone,
{
fn default() -> Self {
const QUERY_CACHE_ESTIMATED_ITEM_COUNT: u64 =
DEFAULT_QUERY_CACHE_MAX_SIZE_BYTES / DEFAULT_QUERY_CACHE_AVG_ITEM_SIZE;

Self::new(
QUERY_CACHE_ESTIMATED_ITEM_COUNT,
DEFAULT_QUERY_CACHE_MAX_SIZE_BYTES,
)
}
}
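The `Default` impl above derives the estimated item count from the two tuning constants. A quick sketch of that sizing arithmetic: a 50 MB weight capacity divided by an average item of 31,250 bytes yields roughly 1,600 cached entries.

```rust
fn main() {
    const MAX_SIZE_BYTES: u64 = 50_000_000;
    const AVG_ITEM_SIZE: u64 = 31_250; // one ~250k-doc segment bitset
    // Estimated item count used to size the cache's internal tables.
    assert_eq!(MAX_SIZE_BYTES / AVG_ITEM_SIZE, 1_600);
}
```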

#[cfg(test)]
mod tests {
use std::hash::{DefaultHasher, Hasher};

use tantivy::query::AllQuery;

use crate::test_utils::build_test_schema;

use super::*;

#[derive(Clone, Eq, PartialEq, Hash, Debug)]
pub enum TestQuery {
Test(u32),
}

impl CachableQuery for TestQuery {
fn should_cache(&self) -> bool {
true
}

fn to_query(
&self,
_schema: &Schema,
_default_field: Option<Field>,
) -> Result<Box<dyn Query>, TantivyError> {
Ok(Box::new(AllQuery))
}
}

#[test]
fn test_cache_key_equivalence() {
let index = build_test_schema();
let reader = index.searcher.segment_readers().first().unwrap();

let query = TestQuery::Test(1234);

let key = CachableQueryKey(reader.segment_id(), &query);
let owned_key: (SegmentId, TestQuery) = key.clone().into();

assert_eq!(key.0, owned_key.0);
assert_eq!(*key.1, owned_key.1);

let mut hasher = DefaultHasher::new();
key.hash(&mut hasher);
let key_hash = hasher.finish();

let mut hasher = DefaultHasher::new();
owned_key.hash(&mut hasher);
let owned_key_hash = hasher.finish();

assert_eq!(key_hash, owned_key_hash);
}
}
96 changes: 96 additions & 0 deletions core/src/rust/tantivy_utils/src/query/prefix_query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//! Query that does a prefix match
use std::sync::Arc;

use tantivy::{
query::{AutomatonWeight, Query},
schema::Field,
};
use tantivy_fst::Automaton;

use super::{range_aware_regex::SkipAutomaton, JSON_PREFIX_SEPARATOR};

#[derive(Debug, Clone)]
pub struct PrefixQuery {
automaton: Arc<SkipAutomaton<PrefixAutomaton>>,
field: Field,
json_path: String,
}

impl PrefixQuery {
pub fn new(prefix: &str, json_path: &str, field: Field) -> Self {
let automaton = PrefixAutomaton {
prefix: prefix.as_bytes().into(),
};
let automaton = SkipAutomaton::new(
automaton,
if json_path.is_empty() {
0
} else {
json_path.len() + JSON_PREFIX_SEPARATOR.len()
},
);

Self {
automaton: Arc::new(automaton),
field,
json_path: json_path.into(),
}
}
}

impl Query for PrefixQuery {
fn weight(
&self,
_enable_scoring: tantivy::query::EnableScoring<'_>,
) -> tantivy::Result<Box<dyn tantivy::query::Weight>> {
let automaton = self.automaton.clone();
let weight: AutomatonWeight<SkipAutomaton<PrefixAutomaton>> = if self.json_path.is_empty() {
AutomatonWeight::new(self.field, automaton)
} else {
AutomatonWeight::new_for_json_path(self.field, automaton, self.json_path.as_bytes())
};

Ok(Box::new(weight))
}
}

#[derive(Debug)]
pub struct PrefixAutomaton {
prefix: Box<[u8]>,
}

impl Automaton for PrefixAutomaton {
// The state here is simple - it's the byte offset we're currently checking
// A value of prefix.len() means we've checked everything and we match
// A value of MAX means we had a mismatch and will never match
type State = usize;

fn start(&self) -> Self::State {
0
}

fn is_match(&self, state: &Self::State) -> bool {
*state == self.prefix.len()
}

fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
if *state < self.prefix.len() {
if byte == self.prefix[*state] {
*state + 1
} else {
usize::MAX
}
} else {
*state
}
}

fn can_match(&self, state: &Self::State) -> bool {
*state != usize::MAX
}

fn will_always_match(&self, state: &Self::State) -> bool {
*state == self.prefix.len()
}
}
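The automaton above advances one byte at a time: the state is the number of prefix bytes matched so far, with `usize::MAX` as a dead state after the first mismatch. A standalone sketch of those transitions (the `Automaton` trait methods inlined as plain functions, no tantivy_fst dependency):

```rust
struct PrefixAutomaton {
    prefix: Box<[u8]>,
}

impl PrefixAutomaton {
    // Same transition logic as the trait impl above.
    fn accept(&self, state: usize, byte: u8) -> usize {
        if state < self.prefix.len() {
            if byte == self.prefix[state] {
                state + 1 // matched one more prefix byte
            } else {
                usize::MAX // dead state: can never match
            }
        } else {
            state // prefix fully matched; everything after is accepted
        }
    }

    fn is_match(&self, state: usize) -> bool {
        state == self.prefix.len()
    }
}

fn main() {
    let a = PrefixAutomaton {
        prefix: b"ab".to_vec().into_boxed_slice(),
    };

    let mut s = 0;
    for b in b"abc" {
        s = a.accept(s, *b);
    }
    assert!(a.is_match(s)); // "abc" starts with "ab"

    let mut s = 0;
    for b in b"xyz" {
        s = a.accept(s, *b);
    }
    assert_eq!(s, usize::MAX); // dead after the first mismatch
}
```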
343 changes: 343 additions & 0 deletions core/src/rust/tantivy_utils/src/query/range_aware_regex.rs
@@ -0,0 +1,343 @@
//! Range aware Regex query
use std::sync::Arc;

use tantivy::{
query::{AutomatonWeight, EnableScoring, Query, Weight},
schema::Field,
TantivyError,
};
use tantivy_fst::{Automaton, Regex};

use super::JSON_PREFIX_SEPARATOR;

// Tantivy's built-in RegexQuery examines every dictionary value for matches.
// For JSON fields this means scanning values of other fields that can never match.
// This query is range aware, limiting the number of terms considered.

#[derive(Debug, Clone)]
pub struct RangeAwareRegexQuery {
regex: Arc<SkipAutomaton<Regex>>,
prefix: String,
field: Field,
}

impl RangeAwareRegexQuery {
/// Creates a new RegexQuery from a given pattern
pub fn from_pattern(
regex_pattern: &str,
prefix: &str,
field: Field,
) -> Result<Self, TantivyError> {
let regex = create_regex(regex_pattern)?;

let regex = SkipAutomaton::new(
regex,
if prefix.is_empty() {
0
} else {
prefix.len() + JSON_PREFIX_SEPARATOR.len()
},
);

Ok(RangeAwareRegexQuery {
regex: regex.into(),
prefix: if prefix.is_empty() {
String::new()
} else {
format!("{}\0s", prefix)
},
field,
})
}

fn specialized_weight(&self) -> AutomatonWeight<SkipAutomaton<Regex>> {
if self.prefix.is_empty() {
AutomatonWeight::new(self.field, self.regex.clone())
} else {
AutomatonWeight::new_for_json_path(
self.field,
self.regex.clone(),
self.prefix.as_bytes(),
)
}
}
}

impl Query for RangeAwareRegexQuery {
fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> Result<Box<dyn Weight>, TantivyError> {
Ok(Box::new(self.specialized_weight()))
}
}

fn create_regex(pattern: &str) -> Result<Regex, TantivyError> {
Regex::new(pattern)
.map_err(|err| TantivyError::InvalidArgument(format!("RangeAwareRegexQuery: {err}")))
}

#[derive(Debug)]
pub struct SkipAutomaton<A> {
inner: A,
skip_size: usize,
}

impl<A> SkipAutomaton<A> {
pub fn new(inner: A, skip_size: usize) -> Self {
Self { inner, skip_size }
}
}

#[derive(Clone)]
pub struct SkipAutomatonState<A> {
count: usize,
inner: A,
}

impl<A> Automaton for SkipAutomaton<A>
where
A: Automaton,
A::State: Clone,
{
type State = SkipAutomatonState<A::State>;

fn start(&self) -> Self::State {
Self::State {
count: 0,
inner: self.inner.start(),
}
}

fn is_match(&self, state: &Self::State) -> bool {
if state.count < self.skip_size {
false
} else {
self.inner.is_match(&state.inner)
}
}

fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
let mut state = state.clone();

if state.count < self.skip_size {
state.count += 1
} else {
state.inner = self.inner.accept(&state.inner, byte);
};

state
}

fn can_match(&self, state: &Self::State) -> bool {
if state.count < self.skip_size {
true
} else {
self.inner.can_match(&state.inner)
}
}

fn will_always_match(&self, state: &Self::State) -> bool {
if state.count < self.skip_size {
false
} else {
self.inner.will_always_match(&state.inner)
}
}
}
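The effect of `skip_size` above is that the first N bytes of each term (the JSON path plus the `"\0s"` separator) are consumed without consulting the inner automaton, so the inner matcher only sees the value portion. A simplified standalone sketch of that idea (hypothetical `skip_then_match` helper modeling exact equality rather than a full automaton):

```rust
// Returns true when the term, after skipping `skip_size` header bytes,
// equals `value`. The real code delegates the tail to an inner automaton.
fn skip_then_match(term: &[u8], skip_size: usize, value: &[u8]) -> bool {
    term.len() >= skip_size && &term[skip_size..] == value
}

fn main() {
    // json_path = "f1", separator = "\0s" => skip_size = 2 + 2 = 4
    let term = b"f1\0svalue";
    assert!(skip_then_match(term, 4, b"value"));
    assert!(!skip_then_match(term, 4, b"other"));
}
```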

#[cfg(test)]
mod tests {
use super::*;

// For backwards-compatibility reasons we must ensure the regex language used covers all non-optional items here:
// https://lucene.apache.org/core/9_0_0/core/org/apache/lucene/util/automaton/RegExp.html
//
// These tests validate this

fn regex_matches(pattern: &str, input: &str) -> bool {
let regex = create_regex(pattern).expect("Regex should compile");

let mut state = regex.start();

for b in input.as_bytes() {
if regex.will_always_match(&state) {
return true;
}

if !regex.can_match(&state) {
return false;
}

state = regex.accept(&state, *b);
}

regex.is_match(&state)
}

#[test]
fn test_regex_empty() {
assert!(regex_matches("", ""))
}

#[test]
fn test_regex_literal() {
assert!(regex_matches("abcd", "abcd"))
}

#[test]
fn test_regex_incomplete() {
assert!(!regex_matches("abcd", "ab"))
}

#[test]
fn test_regex_longer_string() {
assert!(!regex_matches("ab", "abcd"))
}

#[test]
fn test_regex_substring() {
assert!(!regex_matches("bc", "abcd"))
}

#[test]
fn test_regex_union() {
assert!(regex_matches("a|b", "a"));
assert!(regex_matches("a|b", "b"));
assert!(!regex_matches("a|b", "c"));
}

#[test]
fn test_regex_question_mark() {
assert!(regex_matches("a?", "a"));
assert!(regex_matches("a?", ""));
assert!(!regex_matches("a?", "b"));
assert!(!regex_matches("a?", "aa"));
}

#[test]
fn test_regex_asterisk() {
assert!(regex_matches("a*", "a"));
assert!(regex_matches("a*", ""));
assert!(!regex_matches("a*", "b"));
assert!(regex_matches("a*", "aa"));
}

#[test]
fn test_regex_plus() {
assert!(regex_matches("a+", "a"));
assert!(!regex_matches("a+", ""));
assert!(!regex_matches("a+", "b"));
assert!(regex_matches("a+", "aa"));
}

#[test]
fn test_regex_n() {
assert!(regex_matches("a{1}", "a"));
assert!(!regex_matches("a{1}", ""));
assert!(!regex_matches("a{1}", "b"));
assert!(!regex_matches("a{1}", "aa"));
}

#[test]
fn test_regex_n_or_more() {
assert!(regex_matches("a{1,}", "a"));
assert!(!regex_matches("a{1,}", ""));
assert!(!regex_matches("a{1,}", "b"));
assert!(regex_matches("a{1,}", "aa"));
}

#[test]
fn test_regex_n_m() {
assert!(regex_matches("a{1,2}", "a"));
assert!(!regex_matches("a{1,2}", ""));
assert!(!regex_matches("a{1,2}", "b"));
assert!(regex_matches("a{1,2}", "aa"));
assert!(!regex_matches("a{1,2}", "aaa"));
}

#[test]
fn test_regex_char_class() {
assert!(regex_matches("[ab]", "a"));
assert!(regex_matches("[ab]", "b"));
assert!(!regex_matches("[ab]", "c"));
assert!(!regex_matches("[ab]", "aa"));
}

#[test]
fn test_regex_not_char_class() {
assert!(!regex_matches("[^ab]", "a"));
assert!(!regex_matches("[^ab]", "b"));
assert!(regex_matches("[^ab]", "c"));
assert!(!regex_matches("[^ab]", "aa"));
}

#[test]
fn test_regex_char_class_range() {
assert!(regex_matches("[a-z]", "a"));
assert!(regex_matches("[a-z]", "b"));
assert!(!regex_matches("[a-z]", "0"));
assert!(!regex_matches("[a-z]", "aa"));
}

#[test]
fn test_regex_dot() {
assert!(regex_matches(".", "a"));
assert!(regex_matches(".", "b"));
assert!(!regex_matches(".", "aa"));
}

#[test]
fn test_regex_group() {
assert!(regex_matches("(a)", "a"));
assert!(!regex_matches("(a)", "b"));
assert!(!regex_matches("(a)", "aa"));
}

#[test]
fn test_regex_digit() {
assert!(regex_matches(r"\d", "0"));
assert!(!regex_matches(r"\d", "b"));
assert!(!regex_matches(r"\d", "01"));
}

#[test]
fn test_regex_not_digit() {
assert!(regex_matches(r"\D", "b"));
assert!(!regex_matches(r"\D", "0"));
assert!(!regex_matches(r"\D", "ab"));
}

#[test]
fn test_regex_whitespace() {
assert!(regex_matches(r"\s", " "));
assert!(!regex_matches(r"\s", "b"));
assert!(!regex_matches(r"\s", "  "));
}

#[test]
fn test_regex_not_whitespace() {
assert!(regex_matches(r"\S", "a"));
assert!(!regex_matches(r"\S", " "));
assert!(!regex_matches(r"\S", "aa"));
}

#[test]
fn test_regex_word() {
assert!(regex_matches(r"\w", "a"));
assert!(!regex_matches(r"\w", "-"));
assert!(!regex_matches(r"\w", "aa"));
}

#[test]
fn test_regex_not_word() {
assert!(regex_matches(r"\W", "-"));
assert!(!regex_matches(r"\W", "a"));
assert!(!regex_matches(r"\W", "--"));
}

#[test]
fn test_regex_escape() {
assert!(regex_matches(r"\\", r"\"));
assert!(!regex_matches(r"\\", "-"));
assert!(!regex_matches(r"\\", r"\\"));
}
}
176 changes: 176 additions & 0 deletions core/src/rust/tantivy_utils/src/query/shared_doc_set.rs
@@ -0,0 +1,176 @@
//! Shareable doc set that avoids a memcpy per clone
use std::sync::Arc;

use tantivy::{DocId, DocSet, TERMINATED};
use tantivy_common::{BitSet, TinySet};

/// Allows for efficient copying of docsets from an immutable bitset
/// This is doing the same job as BitSetDocSet, but without the memcpy
/// each time we want to create a new instance
pub struct SharedDocSet {
bits: Arc<BitSet>,
current_word_num: u32,
current_word: TinySet,
current_doc: DocId,
}

impl SharedDocSet {
pub fn new(bits: Arc<BitSet>) -> Self {
let current_word = if bits.max_value() == 0 {
TinySet::empty()
} else {
bits.tinyset(0)
};

let mut ret = Self {
bits,
current_word_num: 0,
current_word,
current_doc: 0,
};

ret.advance();
ret
}

#[inline]
fn word_count(&self) -> u32 {
(self.bits.max_value() + 63) / 64
}
}
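`word_count` above is a ceiling division: the number of 64-bit words needed to hold `max_value` bits. A minimal sketch of that arithmetic:

```rust
// (n + 63) / 64 rounds up to the next whole 64-bit word.
fn word_count(max_value: u32) -> u32 {
    (max_value + 63) / 64
}

fn main() {
    assert_eq!(word_count(0), 0); // empty set needs no words
    assert_eq!(word_count(1), 1);
    assert_eq!(word_count(64), 1); // exactly one full word
    assert_eq!(word_count(65), 2); // one bit spills into a second word
}
```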

impl DocSet for SharedDocSet {
#[inline]
fn advance(&mut self) -> DocId {
// Case 1 - bits still in the current word
if let Some(bit) = self.current_word.pop_lowest() {
self.current_doc = (self.current_word_num * 64) + bit;

// Case 2 - no more words
} else if (self.current_word_num + 1) >= self.word_count() {
self.current_doc = TERMINATED;

// Case 3 - advance to next word
} else if let Some(word_num) = self.bits.first_non_empty_bucket(self.current_word_num + 1) {
self.current_word_num = word_num;
self.current_word = self.bits.tinyset(word_num);

// This is safe because first_non_empty_bucket ensured it is non-empty
#[allow(clippy::unwrap_used)]
let bit = self.current_word.pop_lowest().unwrap();
self.current_doc = (self.current_word_num * 64) + bit;

// Case 4 - end of set
} else {
self.current_doc = TERMINATED;
}

self.current_doc
}

fn doc(&self) -> DocId {
self.current_doc
}

fn seek(&mut self, target: DocId) -> DocId {
if target >= self.bits.max_value() {
self.current_doc = TERMINATED
}

let target_word = target / 64;
if target_word > self.current_word_num {
self.current_word_num = target_word;
self.current_word = self.bits.tinyset(self.current_word_num);

self.current_word = self
.current_word
.intersect(TinySet::range_greater_or_equal(target_word));
self.advance();
} else {
while self.current_doc < target {
self.advance();
}
}

self.current_doc
}

fn size_hint(&self) -> u32 {
self.bits.len() as u32
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_empty_docset() {
let bits = BitSet::with_max_value(0);
let mut docset = SharedDocSet::new(bits.into());

assert_eq!(docset.size_hint(), 0);
assert_eq!(docset.doc(), TERMINATED);
assert_eq!(docset.advance(), TERMINATED);
assert_eq!(docset.seek(0), TERMINATED);
}

#[test]
fn test_full_docset() {
let bits = BitSet::with_max_value_and_full(1000);
let mut docset = SharedDocSet::new(bits.into());

assert_eq!(docset.size_hint(), 1000);
for i in 0..1000 {
assert_eq!(i as DocId, docset.doc());
docset.advance();
}

assert_eq!(docset.doc(), TERMINATED);
}

#[test]
fn test_full_docset_seek() {
let bits = BitSet::with_max_value_and_full(1000);
let mut docset = SharedDocSet::new(bits.into());

assert_eq!(docset.size_hint(), 1000);
docset.seek(50);
for i in 50..1000 {
assert_eq!(i as DocId, docset.doc());
docset.advance();
}

assert_eq!(docset.doc(), TERMINATED);
}

#[test]
fn test_sparse_docset() {
let mut bits = BitSet::with_max_value(1000);
bits.insert(100);
bits.insert(235);
let mut docset = SharedDocSet::new(bits.into());

assert_eq!(docset.size_hint(), 2);
assert_eq!(docset.doc(), 100);
docset.advance();
assert_eq!(docset.doc(), 235);
docset.advance();
assert_eq!(docset.doc(), TERMINATED);
}

#[test]
fn test_sparse_docset_seek() {
let mut bits = BitSet::with_max_value(1000);
bits.insert(100);
bits.insert(235);
let mut docset = SharedDocSet::new(bits.into());

assert_eq!(docset.size_hint(), 2);
docset.seek(101);
assert_eq!(docset.doc(), 235);
docset.advance();
assert_eq!(docset.doc(), TERMINATED);
}
}
100 changes: 100 additions & 0 deletions core/src/rust/tantivy_utils/src/test_utils.rs
@@ -0,0 +1,100 @@
//! Utilities for testing
use crate::field_constants;
use tantivy::{
schema::{
Field, JsonObjectOptions, Schema, SchemaBuilder, TextFieldIndexing, FAST, INDEXED, STORED,
STRING,
},
Index, Searcher, TantivyDocument,
};

pub const COL1_NAME: &str = "col1";
pub const COL2_NAME: &str = "col2";
pub const JSON_COL_NAME: &str = "json_col";
pub const JSON_ATTRIBUTE1_NAME: &str = "f1";
pub const JSON_ATTRIBUTE2_NAME: &str = "f2";

pub struct TestIndex {
pub schema: Schema,
pub searcher: Searcher,
pub json_field: Field,
}

// Allow unwraps since this is test code
#[allow(clippy::unwrap_used)]
pub fn build_test_schema() -> TestIndex {
let mut builder = SchemaBuilder::new();

builder.add_text_field(COL1_NAME, STRING | FAST);
builder.add_text_field(COL2_NAME, STRING | FAST);
builder.add_i64_field(field_constants::PART_ID, INDEXED | FAST);
builder.add_i64_field(field_constants::START_TIME, INDEXED | FAST);
builder.add_i64_field(field_constants::END_TIME, INDEXED | FAST);
builder.add_bytes_field(field_constants::PART_KEY, INDEXED | FAST | STORED);
builder.add_json_field(
JSON_COL_NAME,
JsonObjectOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"))
.set_fast(Some("raw")),
);

let schema = builder.build();

let index = Index::create_in_ram(schema.clone());

{
let mut writer = index.writer::<TantivyDocument>(50_000_000).unwrap();

let doc = TantivyDocument::parse_json(
&schema,
r#"{
"col1": "ABC",
"col2": "def",
"__partIdDv__": 1,
"__startTime__": 1234,
"__endTime__": 1235,
"__partKey__": "QUE=",
"json_col": {
"f1": "value",
"f2": "value2"
}
}"#,
)
.unwrap();

writer.add_document(doc).unwrap();

let doc = TantivyDocument::parse_json(
&schema,
r#"{
"col1": "DEF",
"col2": "abc",
"__partIdDv__": 10,
"__startTime__": 4321,
"__endTime__": 10000,
"__partKey__": "QkI=",
"json_col": {
"f1": "othervalue",
"f2": "othervalue2"
}
}"#,
)
.unwrap();

writer.add_document(doc).unwrap();

writer.commit().unwrap();
}

let reader = index.reader().unwrap();
let searcher = reader.searcher();

let json_field = schema.get_field(JSON_COL_NAME).unwrap();

TestIndex {
schema,
searcher,
json_field,
}
}
8 changes: 8 additions & 0 deletions core/src/test/resources/application_test.conf
@@ -99,6 +99,13 @@ filodb {
block-memory-manager-percent = 60
}

tantivy {
column-cache-count = 1000
query-cache-max-bytes = 50MB
query-cache-estimated-item-size = 31250
deleted-doc-merge-threshold = 0.1
}

flush-task-parallelism = 1
ensure-block-memory-headroom-percent = 5
ensure-tsp-count-headroom-percent = 5
@@ -110,6 +117,7 @@ filodb {
index-faceting-enabled-for-all-labels = true
disable-index-caching = false
type-field-indexing-enabled = true
part-key-index-type = lucene
}

tasks {
926 changes: 926 additions & 0 deletions core/src/test/scala/filodb.core/memstore/PartKeyIndexRawSpec.scala

Large diffs are not rendered by default.

904 changes: 15 additions & 889 deletions core/src/test/scala/filodb.core/memstore/PartKeyLuceneIndexSpec.scala

Large diffs are not rendered by default.

@@ -0,0 +1,27 @@
package filodb.core.memstore

import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers

class PartKeyQueryBuilderSpec extends AnyFunSpec with Matchers {
it("should match the regex after anchors stripped") {
for ((regex, regexNoAnchors) <- Map(
"""^.*$""" -> """.*""", // both anchors are stripped.
"""\$""" -> """\$""", // \$ is not removed.
"""\\\$""" -> """\\\$""", // \$ is not removed.
"""\\$""" -> """\\""", // $ is removed.
"""$""" -> """""", // $ is removed.
"""\^.*$""" -> """\^.*""", // do not remove \^.
"""^ ^.*$""" -> """ ^.*""", // only remove the first ^.
"""^.*\$""" -> """.*\$""", // do not remove \$
"""^ $foo""" -> """ $foo""", // the $ is not at the end, keep it.
""".* $ \ $$""" -> """.* $ \ $""", // only remove the last $
"""foo.*\\\ $""" -> """foo.*\\\ """, // remove $ since it is at the end and not escaped.
"""foo.*\\\$""" -> """foo.*\\\$""", // keep \$.
"""foo.*\\$""" -> """foo.*\\""", // remove $ since it is at the end and not escaped.
"""foo.*$\\\\$""" -> """foo.*$\\\\""", // keep the first $ since it is not at the end.
)) {
PartKeyQueryBuilder.removeRegexAnchors(regex) shouldEqual regexNoAnchors
}
}
}
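The table above describes the anchor-stripping rules: drop a single leading `^`, and drop a trailing `$` only when it is not escaped (i.e. preceded by an even number of backslashes). A hedged standalone sketch of that logic in Rust (a simplified model of `removeRegexAnchors`, not the actual Scala implementation):

```rust
fn remove_regex_anchors(re: &str) -> String {
    let mut s = re;
    // A '^' at position 0 can never be escaped, so drop at most one.
    if s.starts_with('^') {
        s = &s[1..];
    }
    // A trailing '$' is an anchor only if preceded by an even number of '\'.
    if s.ends_with('$') {
        let backslashes = s[..s.len() - 1]
            .chars()
            .rev()
            .take_while(|c| *c == '\\')
            .count();
        if backslashes % 2 == 0 {
            s = &s[..s.len() - 1];
        }
    }
    s.to_string()
}

fn main() {
    assert_eq!(remove_regex_anchors(r"^.*$"), r".*"); // both anchors stripped
    assert_eq!(remove_regex_anchors(r"\$"), r"\$"); // escaped $ kept
    assert_eq!(remove_regex_anchors(r"foo.*\\$"), r"foo.*\\"); // unescaped $ removed
    assert_eq!(remove_regex_anchors(r"^ $foo"), r" $foo"); // mid-string $ kept
}
```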
145 changes: 145 additions & 0 deletions core/src/test/scala/filodb.core/memstore/PartKeyTantivyIndexSpec.scala
@@ -0,0 +1,145 @@
package filodb.core.memstore

import filodb.core.GdeltTestData.dataset6
import filodb.core.{DatasetRef, TestData}
import filodb.core.binaryrecord2.RecordBuilder
import filodb.core.metadata.PartitionSchema
import filodb.core.query.ColumnFilter
import filodb.core.query.Filter.{Equals, EqualsRegex, In}
import org.scalatest.BeforeAndAfter
import org.scalatest.funspec.AnyFunSpec
import org.scalatest.matchers.should.Matchers
import org.scalatest.time.SpanSugar.convertIntToGrainOfTime

import java.io.File

class PartKeyTantivyIndexSpec extends AnyFunSpec with Matchers with BeforeAndAfter with PartKeyIndexRawSpec {
val keyIndex = new PartKeyTantivyIndex(dataset6.ref, dataset6.schema.partition, 0, 1.hour.toMillis,
Some(new File(System.getProperty("java.io.tmpdir"), "part-key-lucene-index")))

val partBuilder = new RecordBuilder(TestData.nativeMem)

before {
keyIndex.reset()
keyIndex.refreshReadersBlocking()
}

after {
partBuilder.removeAndFreeContainers(partBuilder.allContainers.length)
}

protected def createNewIndex(ref: DatasetRef,
schema: PartitionSchema,
facetEnabledAllLabels: Boolean,
facetEnabledShardKeyLabels: Boolean,
shardNum: Int,
retentionMillis: Long,
diskLocation: Option[File],
lifecycleManager: Option[IndexMetadataStore]): PartKeyIndexRaw = {
new PartKeyTantivyIndex(ref, schema, shardNum, retentionMillis, diskLocation, lifecycleManager)
}

it should behave like commonPartKeyTests(keyIndex, partBuilder)

it("should encode equals queries correctly") {
val builder = new TantivyQueryBuilder()

// Simple equals filter
val filters = List(ColumnFilter("col1", Equals("abcd")))
val query = builder.buildQuery(filters)

query should contain theSameElementsInOrderAs List(1,// Boolean
1, // Must
2, // Equals
4, 0, // Length 4
99, 111, 108, 49, // col1
4, 0, // Length 4
97, 98, 99, 100, // abcd
0) // End boolean
}

it("should encode equals regex correctly") {
val builder = new TantivyQueryBuilder()

// Simple regex filter
val filters = List(ColumnFilter("col1", EqualsRegex("a.*b")))
val query = builder.buildQuery(filters)

query should contain theSameElementsInOrderAs List(1,// Boolean
1, // Must
3, // Regex
4, 0, // Length 4
99, 111, 108, 49, // col1
4, 0, // Length 4
97, 46, 42, 98, // a.*b
0) // End boolean
}

it("should encode term in correctly") {
val builder = new TantivyQueryBuilder()

// Simple term-in filter
val filters = List(ColumnFilter("col1", In(Set("a","b"))))
val query = builder.buildQuery(filters)

query should contain theSameElementsInOrderAs List(1,// Boolean
1, // Must
4, // Term In
4, 0, // Length 4
99, 111, 108, 49, // col1
2, 0, // Term count 2
1, 0, // Length 1
97, // a
1, 0, // Length 1
98, // b
0) // End boolean
}

it("should encode prefix correctly") {
val builder = new TantivyQueryBuilder()

// Regex filter that reduces to a prefix match
val filters = List(ColumnFilter("col1", EqualsRegex("a.*")))
val query = builder.buildQuery(filters)

query should contain theSameElementsInOrderAs List(1, // Boolean
1, // Must
5, // Prefix
4, 0, // Length 4
99, 111, 108, 49, // col1
1, 0, // Length 1
97, // a
0) // End boolean
}

it("should encode match all correctly") {
val builder = new TantivyQueryBuilder()

// Regex filter that reduces to a match-all
val filters = List(ColumnFilter("col1", EqualsRegex(".*")))
val query = builder.buildQuery(filters)

query should contain theSameElementsInOrderAs List(1, // Boolean
1, // Must
6, // Match All
0) // End boolean
}

it("should encode start and end time properly") {
val builder = new TantivyQueryBuilder()

// Match-all filter with an explicit time range
val filters = List(ColumnFilter("col1", EqualsRegex(".*")))
val query = builder.buildQueryWithStartAndEnd(filters, 1, Long.MaxValue)

query should contain theSameElementsInOrderAs List(1, // Boolean
1, // Must
6, // Match All
1, // Must
7, // Long Range
11, 0, 95, 95, 101, 110, 100, 84, 105, 109, 101, 95, 95, // __endTime__
1, 0, 0, 0, 0, 0, 0, 0, // 0x1
-1, -1, -1, -1, -1, -1, -1, 127, // Long.MAX_VALUE
0) // End boolean
}
}
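
The expected byte lists in these tests suggest a simple tag-length-value wire format: a node tag, then each UTF-8 string prefixed with a little-endian u16 length, and a trailing zero byte closing the boolean node. A minimal sketch of an encoder for the equals case, inferred purely from the assertions above (`QueryEncodingSketch` and its tag names are illustrative, not the actual `TantivyQueryBuilder` internals):

```scala
import java.nio.charset.StandardCharsets

// Hypothetical sketch of the tag-length-value layout the expected byte
// sequences encode. Tag values are taken from the comments in the tests.
object QueryEncodingSketch {
  val BooleanTag: Byte = 1  // Boolean query node
  val MustOccur: Byte  = 1  // Must occurrence
  val EqualsTag: Byte  = 2  // Equals leaf
  val EndTag: Byte     = 0  // End of boolean node

  private def lengthPrefixed(s: String): Seq[Byte] = {
    val bytes = s.getBytes(StandardCharsets.UTF_8)
    // u16 length, little-endian, followed by the raw UTF-8 bytes
    Seq((bytes.length & 0xFF).toByte, ((bytes.length >> 8) & 0xFF).toByte) ++ bytes
  }

  def encodeEquals(column: String, value: String): Seq[Byte] =
    Seq(BooleanTag, MustOccur, EqualsTag) ++
      lengthPrefixed(column) ++ lengthPrefixed(value) :+ EndTag
}
```

Under these assumptions, `QueryEncodingSketch.encodeEquals("col1", "abcd")` reproduces the byte sequence asserted in the first test.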
84 changes: 69 additions & 15 deletions jmh/src/main/scala/filodb.jmh/PartKeyIndexBenchmark.scala
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
package filodb.jmh

import java.lang.management.{BufferPoolMXBean, ManagementFactory}
import java.util.concurrent.TimeUnit

import scala.collection.mutable
import scala.concurrent.duration._

import ch.qos.logback.classic.{Level, Logger}
import org.openjdk.jmh.annotations
import org.openjdk.jmh.annotations._
import spire.syntax.cfor._

import filodb.core.DatasetRef
import filodb.core.binaryrecord2.RecordBuilder
import filodb.core.memstore.PartKeyLuceneIndex
import filodb.core.memstore.{PartKeyIndexRaw, PartKeyLuceneIndex, PartKeyTantivyIndex}
import filodb.core.metadata.Schemas
import filodb.core.metadata.Schemas.untyped
import filodb.core.query.{ColumnFilter, Filter}
@@ -21,14 +21,16 @@ import filodb.memory.format.{UnsafeUtils, ZeroCopyUTF8String}
import filodb.timeseries.TestTimeseriesProducer

// scalastyle:off
@State(Scope.Thread)
class PartKeyIndexBenchmark {
@State(Scope.Benchmark)
abstract class PartKeyIndexBenchmark {

org.slf4j.LoggerFactory.getLogger("filodb").asInstanceOf[Logger].setLevel(Level.ERROR)

protected def createPartKeyIndex(): PartKeyIndexRaw

println(s"Building Part Keys")
val ref = DatasetRef("prometheus")
val partKeyIndex = new PartKeyLuceneIndex(ref, untyped.partition, true, true,0, 1.hour.toMillis)
val partKeyIndex = createPartKeyIndex()
val numSeries = 1000000
val ingestBuilder = new RecordBuilder(MemFactory.onHeapFactory, RecordBuilder.DefaultContainerSize, false)
val untypedData = TestTimeseriesProducer.timeSeriesData(0, numSeries,
@@ -53,6 +55,20 @@ class PartKeyIndexBenchmark {
}
}

private var lookupTime = now + 1000

// Adjust the time range for every iteration. Without this everything ends up fully covered
// by query caching and you're only testing performance of the cache.
//
// In the real world it's very common to run the same query again and again but with a different time range
// - think cases like a live dashboard or alerting system.
@inline
protected def currentLookupTime(): Long = {
lookupTime += 1

lookupTime
}

val start = System.nanoTime()

println(s"Indexing started")
@@ -61,10 +77,8 @@ class PartKeyIndexBenchmark {
val end = System.nanoTime()

println(s"Indexing finished. Added $partId part keys. Took ${(end-start)/1000000000L}s")
import scala.collection.JavaConverters._

println(s"Index Memory Map Size: " +
s"${ManagementFactory.getPlatformMXBeans(classOf[BufferPoolMXBean]).asScala.find(_.getName == "mapped").get.getMemoryUsed}")
println(s"Index Memory Map Size: ${partKeyIndex.indexMmapBytes}")

@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@@ -77,7 +91,7 @@ class PartKeyIndexBenchmark {
ColumnFilter("host", Filter.Equals("H0")),
ColumnFilter("_metric_", Filter.Equals("heap_usage0"))),
now,
now + 1000)
currentLookupTime())
}
}

@@ -93,7 +107,7 @@ class PartKeyIndexBenchmark {
ColumnFilter("host", Filter.Equals("H0")),
ColumnFilter("_metric_", Filter.Equals("heap_usage0"))),
now,
now + 1000)
currentLookupTime())
}
}

@@ -109,7 +123,7 @@ class PartKeyIndexBenchmark {
ColumnFilter("_metric_", Filter.Equals("heap_usage0")),
ColumnFilter("instance", Filter.EqualsRegex("Instance-2.*"))),
now,
now + 1000)
currentLookupTime())
}
}

@@ -125,7 +139,7 @@ class PartKeyIndexBenchmark {
ColumnFilter("_metric_", Filter.Equals("heap_usage0")),
ColumnFilter("instance", Filter.EqualsRegex(".*2"))),
now,
now + 1000)
currentLookupTime())
}
}

@@ -144,7 +158,7 @@ class PartKeyIndexBenchmark {
"Instance-11|Instance-12|Instance-13|Instance-14|Instance-15|Instance-16|Instance-17|Instance-18|Instance-19|Instance-20|" +
"Instance-21|Instance-22|Instance-23|Instance-24|Instance-25|Instance-26|Instance-27|Instance-28|Instance-29|Instance-30"))),
now,
now + 1000).length
currentLookupTime()).length
}
}

@@ -168,7 +182,7 @@ class PartKeyIndexBenchmark {
cforRange ( 0 until 8 ) { i =>
val filter = Seq(ColumnFilter("_ns_", Filter.Equals(s"App-$i")),
ColumnFilter("_ws_", Filter.Equals("demo")))
partKeyIndex.labelValuesEfficient(filter, now, now + 1000, "_metric_", 10000)
partKeyIndex.labelValuesEfficient(filter, now, currentLookupTime(), "_metric_", 10000)
}
}

@@ -181,11 +195,51 @@ class PartKeyIndexBenchmark {
val filter = Seq(ColumnFilter("_ns_", Filter.Equals(s"App-$i")),
ColumnFilter("_ws_", Filter.Equals("demo")))
val res = mutable.HashSet[ZeroCopyUTF8String]()
partKeyIndex.partIdsFromFilters(filter, now, now + 1000).foreach { pId =>
partKeyIndex.partIdsFromFilters(filter, now, currentLookupTime()).foreach { pId =>
val pk = partKeyIndex.partKeyFromPartId(pId)
Schemas.promCounter.partition.binSchema.singleColValues(pk.get.bytes, UnsafeUtils.arayOffset, "_metric_", res)
}
}
}

@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def partIdsLookupOverTime(): Unit = {
cforRange ( 0 until 8 ) { _ =>
partKeyIndex.partIdsFromFilters(
Seq(ColumnFilter("_ns_", Filter.Equals("App-0")),
ColumnFilter("_ws_", Filter.Equals("demo")),
ColumnFilter("host", Filter.Equals("H0")),
ColumnFilter("_metric_", Filter.Equals("heap_usage0")),
ColumnFilter("instance", Filter.Equals("Instance-1"))),
now,
currentLookupTime())
}
}

}

@State(Scope.Benchmark)
class PartKeyLuceneIndexBenchmark extends PartKeyIndexBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
new PartKeyLuceneIndex(ref, untyped.partition, true, true, 0, 1.hour.toMillis)
}
}

@State(Scope.Benchmark)
class PartKeyTantivyIndexBenchmark extends PartKeyIndexBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
PartKeyTantivyIndex.startMemoryProfiling()

new PartKeyTantivyIndex(ref, untyped.partition, 0, 1.hour.toMillis)
}

@TearDown(annotations.Level.Trial)
def teardown(): Unit = {
PartKeyTantivyIndex.stopMemoryProfiling()
val index = partKeyIndex.asInstanceOf[PartKeyTantivyIndex]

println(s"\nCache stats:\n${index.dumpCacheStats()}\n")
}
}
215 changes: 215 additions & 0 deletions jmh/src/main/scala/filodb.jmh/PartKeyIndexExternalBenchmark.scala
@@ -0,0 +1,215 @@
package filodb.jmh

import java.io.File
import java.util.Base64
import java.util.concurrent.TimeUnit

import scala.io.Source

import ch.qos.logback.classic.{Level, Logger}
import org.openjdk.jmh.annotations.{Benchmark, BenchmarkMode, Mode, OutputTimeUnit, Scope, State, TearDown}
import org.openjdk.jmh.annotations
import org.scalatest.time.SpanSugar.convertIntToGrainOfTime

import filodb.core.DatasetRef
import filodb.core.memstore.{IndexMetadataStore, IndexState, PartKeyIndexRaw, PartKeyLuceneIndex, PartKeyTantivyIndex}
import filodb.core.metadata.Schemas.untyped
import filodb.core.query.{ColumnFilter, Filter}

/*
A benchmark that loads data from an external file of part keys.
This is meant to be used with real-world exported data to simulate and evaluate changes.
The input file is specified as a file path and should be a CSV with the following columns:
* partKey (base64 encoded)
* startTime (long)
* endTime (long)
A header row can optionally be included.
To use this with a data set, change the file path below and adjust the benchmark queries as needed.
*/
// scalastyle:off
@State(Scope.Benchmark)
abstract class PartKeyIndexExternalBenchmark {
// File path to load from
final private val inputPath = "partKeys.csv"
// File path to use for index storage
final protected val indexPath = "index/path"

// Filters to create queries below
private def wsFilter = ColumnFilter("_ws_", Filter.Equals("myws"))
private def nsFilter = ColumnFilter("_ns_", Filter.Equals("myns"))
private def narrowFilter = ColumnFilter("hostname", Filter.Equals("example"))

org.slf4j.LoggerFactory.getLogger("filodb").asInstanceOf[Logger].setLevel(Level.ERROR)

protected def computeIndexPath(): Option[File] = {
val file = new File(s"${indexPath}${this.getClass.getSimpleName}")

if (!file.exists()) {
new File(s"${file.toPath}/${ref.dataset}/0").mkdirs()
}

Some(file)
}

protected def createPartKeyIndex(): PartKeyIndexRaw

println(s"Building Part Keys")
val ref = DatasetRef("prometheus")
val partKeyIndex = createPartKeyIndex()

var partId = 1

private def loadData(): Unit = {
val source = Source.fromFile(inputPath)
for (line <- source.getLines()) {
if (!line.startsWith("partkey")) {
val parts = line.split(',')

val partKey = Base64.getDecoder.decode(parts(0))
val startTime = parts(1).toLong
val endTime = parts(2).toLong

partKeyIndex.addPartKey(partKey, partId, startTime, endTime)()
partId += 1
}
}
source.close()
}

if (partKeyIndex.indexNumEntries == 0) {
val start = System.nanoTime()
println(s"Indexing started at path ${partKeyIndex.indexDiskLocation}")
loadData()
partKeyIndex.refreshReadersBlocking()
val end = System.nanoTime()

println(s"Indexing finished. Added $partId part keys. Took ${(end - start) / 1000000000L}s")
} else {
partKeyIndex.refreshReadersBlocking()
println(s"Loaded existing index with ${partKeyIndex.indexNumEntries} part keys")
}

@TearDown(annotations.Level.Trial)
def teardown2(): Unit = {
println(s"Ram usage after testing ${partKeyIndex.indexRamBytes}")
println(s"Mmap usage after testing ${partKeyIndex.indexMmapBytes}")
}

private var lookupTime = 1

@inline
private def currentLookupTime(): Long = {
lookupTime += 1

lookupTime
}

// Wide query - matches most documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def labelValuesWide(): Unit = {
partKeyIndex.labelValuesEfficient(Seq(wsFilter),
currentLookupTime(), Long.MaxValue, "_ns_")
}

// Wide query - matches most documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def partIdsFromFiltersWide(): Unit = {
partKeyIndex.partIdsFromFilters(Seq(wsFilter,
nsFilter),
currentLookupTime(), Long.MaxValue, 10000)
}

// Wide query - matches most documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def partKeysFromFiltersWide(): Unit = {
partKeyIndex.partKeyRecordsFromFilters(Seq(wsFilter,
nsFilter),
currentLookupTime(), Long.MaxValue, 100)
}

// Narrow query - matches few (< 10) documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def labelValuesNarrow(): Unit = {
partKeyIndex.labelValuesEfficient(Seq(wsFilter,
nsFilter,
narrowFilter),
currentLookupTime(), Long.MaxValue, "pod")
}

// Narrow query - matches few (< 10) documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def partIdsFromFiltersNarrow(): Unit = {
partKeyIndex.partIdsFromFilters(Seq(wsFilter,
nsFilter,
narrowFilter),
currentLookupTime(), Long.MaxValue, 10000)
}

// Narrow query - matches few (< 10) documents
@Benchmark
@BenchmarkMode(Array(Mode.Throughput))
@OutputTimeUnit(TimeUnit.SECONDS)
def partKeysFromFiltersNarrow(): Unit = {
partKeyIndex.partKeyRecordsFromFilters(Seq(wsFilter,
nsFilter,
narrowFilter),
currentLookupTime(), Long.MaxValue, 100)
}
}

@State(Scope.Benchmark)
class PartKeyLuceneIndexExternalBenchmark extends PartKeyIndexExternalBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
new PartKeyLuceneIndex(ref, untyped.partition, true, true, 0, 1.hour.toMillis, diskLocation = computeIndexPath(),
lifecycleManager = Some(new MockLifecycleManager()))
}

@TearDown(annotations.Level.Trial)
def teardown(): Unit = {
// This is needed to keep data consistent between runs with Lucene
partKeyIndex.closeIndex()
}
}

@State(Scope.Benchmark)
class PartKeyTantivyIndexExternalBenchmark extends PartKeyIndexExternalBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
PartKeyTantivyIndex.startMemoryProfiling()

new PartKeyTantivyIndex(ref, untyped.partition, 0, 1.hour.toMillis, diskLocation = computeIndexPath(),
lifecycleManager = Some(new MockLifecycleManager()))
}

@TearDown(annotations.Level.Trial)
def teardown(): Unit = {
PartKeyTantivyIndex.stopMemoryProfiling()
val index = partKeyIndex.asInstanceOf[PartKeyTantivyIndex]

println(s"\nCache stats:\n${index.dumpCacheStats()}\n")
}
}

class MockLifecycleManager extends IndexMetadataStore {

override def initState(datasetRef: DatasetRef, shard: Int): (IndexState.Value, Option[Long]) = (IndexState.Synced, None)

override def currentState(datasetRef: DatasetRef, shard: Int): (IndexState.Value, Option[Long]) = (IndexState.Synced, None)

override def updateState(datasetRef: DatasetRef, shard: Int, state: IndexState.Value, time: Long): Unit = {}

override def updateInitState(datasetRef: DatasetRef, shard: Int, state: IndexState.Value, time: Long): Unit = {}
}
87 changes: 87 additions & 0 deletions jmh/src/main/scala/filodb.jmh/PartKeyIndexIngestionBenchmark.scala
@@ -0,0 +1,87 @@
package filodb.jmh

import scala.collection.mutable.ArrayBuffer

import org.openjdk.jmh.annotations.{Level, Param, Scope, Setup, State, TearDown}
import org.openjdk.jmh.annotations
import org.scalatest.time.SpanSugar.convertIntToGrainOfTime
import spire.syntax.cfor.cforRange

import filodb.core.memstore.{PartKeyIndexRaw, PartKeyLuceneIndex, PartKeyTantivyIndex}
import filodb.core.metadata.Schemas.untyped
import filodb.memory.BinaryRegionConsumer

// scalastyle:off
@State(Scope.Benchmark)
abstract class PartKeyIndexIngestionBenchmark extends PartKeyIndexBenchmark {
// How many part keys are added / removed per second
final val itemsPerSecond = 10
// How often do we commit to disk / refresh readers
final val commitWindowInSeconds = 30

// 0 days, 1 day, 5 days, 30 days
@Param(Array("0", "1", "5", "30"))
var durationInDays: Long = _

@Setup(Level.Trial)
def setup(): Unit = {
println(s"Simulating $durationInDays days of churn")
val start = System.nanoTime()
val churnSteps = ((durationInDays * 24 * 60 * 60) / commitWindowInSeconds).toInt
var partId = 0
val itemsPerStep = commitWindowInSeconds * itemsPerSecond

val partKeys = new ArrayBuffer[Array[Byte]]()
val consumer = new BinaryRegionConsumer {
def onNext(base: Any, offset: Long): Unit = {
val partKey = untyped.partition.binSchema.asByteArray(base, offset)
partKeys += partKey
}
}
partKeyBuilder.allContainers.foreach(_.consumeRecords(consumer))

cforRange ( 0 until churnSteps ) { _ =>
cforRange ( 0 until itemsPerStep ) { _ =>
val partKey = partKeys(partId)
// When we ingested we used 1 based part IDs, not 0 based
partKeyIndex.upsertPartKey(partKey, partId+1, now)()

partId += 1
partId = partId % numSeries
}
}

val end = System.nanoTime()
println(s"Finished ingesting new changes. Took ${(end-start)/1000000000L}s")

partKeyIndex.refreshReadersBlocking()
val end2 = System.nanoTime()
println(s"Churning finished. Took ${(end2-start)/1000000000L}s")
println(s"New Index Memory Map Size: ${partKeyIndex.indexMmapBytes}")
println(s"Doc count: ${partKeyIndex.indexNumEntries}")
}
}

@State(Scope.Benchmark)
class PartKeyLuceneIndexIngestionBenchmark extends PartKeyIndexIngestionBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
new PartKeyLuceneIndex(ref, untyped.partition, true, true, 0, 1.hour.toMillis)
}
}

@State(Scope.Benchmark)
class PartKeyTantivyIndexIngestionBenchmark extends PartKeyIndexIngestionBenchmark {
override protected def createPartKeyIndex(): PartKeyIndexRaw = {
PartKeyTantivyIndex.startMemoryProfiling()

new PartKeyTantivyIndex(ref, untyped.partition, 0, 1.hour.toMillis)
}

@TearDown(annotations.Level.Trial)
def teardown(): Unit = {
PartKeyTantivyIndex.stopMemoryProfiling()
val index = partKeyIndex.asInstanceOf[PartKeyTantivyIndex]

println(s"\nCache stats:\n${index.dumpCacheStats()}\n")
}
}
3 changes: 2 additions & 1 deletion project/Dependencies.scala
@@ -80,7 +80,8 @@ object Dependencies {
"com.esotericsoftware" % "kryo" % "4.0.0" excludeAll(excludeMinlog),
"com.dorkbox" % "MinLog-SLF4J" % "1.12",
"com.github.ben-manes.caffeine" % "caffeine" % "3.0.5",
"com.twitter" %% "chill" % "0.9.3"
"com.twitter" %% "chill" % "0.9.3",
"org.apache.commons" % "commons-lang3" % "3.14.0"
)

lazy val sparkJobsDeps = commonDeps ++ Seq(
2 changes: 2 additions & 0 deletions project/FiloBuild.scala
@@ -1,5 +1,6 @@
import Dependencies._
import FiloSettings._
import RustPlugin._
import com.typesafe.sbt.SbtMultiJvm.MultiJvmKeys.MultiJvm
import io.gatling.sbt.GatlingPlugin
import pl.project13.scala.sbt.JmhPlugin
@@ -45,6 +46,7 @@ object Submodules {
)

lazy val core = (project in file("core"))
.enablePlugins(RustPlugin)
.dependsOn(memory % "compile->compile; test->test")
.settings(
commonSettings,
230 changes: 230 additions & 0 deletions project/RustPlugin.scala
@@ -0,0 +1,230 @@
import org.apache.commons.lang3._
import sbt._
import sbt.Keys._
import sbt.io.Path._
import sbt.nio.Keys._
import scala.sys.process._

/*
Plugin that adds support to build native Rust code as part of a module.
This will build the code, include it in resources, and allow runtime loading.
*/
object RustPlugin extends AutoPlugin {
object autoImport {
// Tasks
val rustCompile = taskKey[Unit]("Compile rust code for this module.")
val rustClean = taskKey[Unit]("Clean rust build for this module.")
val rustGatherLibraries = taskKey[Seq[(File, String)]]("Gather the list of native libraries produced by the build.")
val rustLint = taskKey[Unit]("Run linting on rust code for this module.")
val rustTest = taskKey[Unit]("Test rust code for this module.")

// Settings
val rustSourceDir = settingKey[File]("Path to base directory with rust code.")
val rustArchitectures = settingKey[Seq[String]]("List of architectures to build for. Takes either a Rust " +
"target tuple or the special key 'host' to build for the current machine. To supply multiple architectures " +
"separate them with a ';' character.")
val rustFeatures = settingKey[String]("Value to pass to cargo's --features option. Defaults to an empty string.")
val rustOptimize = settingKey[Boolean]("Enable optimization during rust builds. Defaults to true.")
}

import autoImport._

lazy val settings: Seq[Setting[_]] = Seq(
rustSourceDir := baseDirectory.value / "src" / "rust",
rustArchitectures := {
val archs = Option(System.getProperty("rust.architectures")).getOrElse("host")

archs.split(';').toSeq
},
rustFeatures := {
val features = Option(System.getProperty("rust.features")).getOrElse("")

features
},
rustOptimize := {
val optimize = Option(System.getProperty("rust.optimize")).getOrElse("true")

optimize.toBoolean
},
rustClean := {
val log = streams.value.log
val sourceDir = rustSourceDir.value

log.info(s"Cleaning rust source at $sourceDir")

val returnCode = Process("cargo clean", sourceDir) ! cargoLog(log)

if (returnCode != 0)
sys.error(s"cargo clean failed with exit code $returnCode")
},
rustCompile := {
val log = streams.value.log
val sourceDir = rustSourceDir.value
val features = rustFeatures.value

for (archTarget <- rustArchitectures.value) {
log.info(s"Compiling rust source at $sourceDir for architecture $archTarget with features '$features'")

// target setup
val targetCommand = if (archTarget == "host") {
""
} else {
s"--target $archTarget"
}

// Use build for the host target, zigbuild for everything else
val buildCommand = if (archTarget == "host") {
"build"
} else {
"zigbuild"
}

// Check if release build
val buildFlags = if (rustOptimize.value) {
"--release"
} else {
""
}

val featureFlags = if (features.isBlank) {
""
} else {
s" --features $features "
}

val returnCode = Process(s"cargo $buildCommand $buildFlags $featureFlags $targetCommand",
sourceDir) ! cargoLog(log)

if (returnCode != 0)
sys.error(s"cargo build failed with exit code $returnCode")
}
},
rustGatherLibraries := {
val log = streams.value.log
var list: Seq[(File, String)] = Seq()

// Compile first
rustCompile.value

val release = rustOptimize.value
val releaseDir = if (release) {
"release"
} else {
"debug"
}

val targetFolder = rustSourceDir.value / "target"
val fileTree = fileTreeView.value

// For each architecture find artifacts
for (archTarget <- rustArchitectures.value) {
// Special case - host
val archFolder = if (archTarget == "host") {
targetFolder / releaseDir
} else {
// General case
targetFolder / archTarget / releaseDir
}

// get os arch / kernel, build path
val resourceArchTarget = mapRustTargetToJVMTarget(archTarget)

// Find library files in folder
// We place every produced library in a resource path like
// /native/<kernel>/<arch>/<file>
val glob = fileTree.list(Glob(archFolder) / "*.{so,dylib}").collect {
case (path, attributes) if attributes.isRegularFile => file(path.toString)
}
val files = glob.pair(rebase(archFolder, s"/native/$resourceArchTarget"))

list = list ++ files
}

list
},
rustTest := {
val log = streams.value.log
val sourceDir = rustSourceDir.value

val returnCode = Process("cargo test", sourceDir) ! cargoLog(log)

returnCode match {
case 101 => sys.error("One or more tests failed")
case 0 => ()
case x => sys.error(s"cargo test failed with exit code $x")
}
},
resourceGenerators += Def.task {
val log = streams.value.log

val libraries: Seq[(File, String)] = rustGatherLibraries.value
val resources: Seq[File] = for ((file, path) <- libraries) yield {
val resource = resourceManaged.value / path

if (IO.getModifiedTimeOrZero(file) > IO.getModifiedTimeOrZero(resource)) {
IO.copyFile(file, resource, preserveLastModified = true)
}
resource
}
resources
}.taskValue
)

lazy val testSettings: Seq[Setting[_]] = Seq(
rustLint := {
val log = streams.value.log
val sourceDir = rustSourceDir.value

val returnCode = Process("cargo clippy --all-targets -- -D warnings", sourceDir) ! cargoLog(log)
if (returnCode != 0)
sys.error(s"cargo clippy failed with exit code $returnCode")
},
test := {
// Run rust tests and linting
rustLint.value
rustTest.value
// Run base test task
test.value
}
)

// Map an input Rust arch tuple to the correct target folder for JVM loading
private def mapRustTargetToJVMTarget(arch: String): String = {
// Rust tuples are basically clang tuples and look like:
// aarch64-apple-darwin
// x86_64-unknown-linux-gnu
//
// We want the architecture (first part)
// and the kernel part (third part)
val RustPattern = "([^-]+)-([^-]+)-([^-]+).*".r
arch match {
case "host" => s"$getHostKernel/${SystemUtils.OS_ARCH}"
case RustPattern(arch, _, kernel) => s"$kernel/$arch"
case x => sys.error(s"Unsupported architecture $x")
}
}

// Get normalized host kernel name
private def getHostKernel: String = {
if (SystemUtils.IS_OS_LINUX) {
"linux"
} else if (SystemUtils.IS_OS_MAC) {
"darwin"
} else if (SystemUtils.IS_OS_WINDOWS) {
"windows"
} else {
sys.error(s"Unhandled platform ${SystemUtils.OS_NAME}")
}
}

// Cargo writes normal build output to both stdout and stderr,
// so log all of it at the info level
private def cargoLog(log: sbt.Logger): ProcessLogger = new ProcessLogger {
def out(s: => String): Unit = log.info(s)
def err(s: => String): Unit = log.info(s)
def buffer[T](f: => T): T = f
}

override lazy val projectSettings = inConfig(Compile)(settings) ++ inConfig(Test)(settings ++ testSettings)
}
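
At runtime, a library packaged under the `/native/<kernel>/<arch>/<file>` resource layout that `rustGatherLibraries` produces could be loaded with a sketch like the following; `NativeLibraryLoader` is illustrative and not the loader FiloDB actually ships:

```scala
import java.nio.file.{Files, StandardCopyOption}

// Hypothetical runtime loader for the /native/<kernel>/<arch>/<file> resource
// layout produced above. Only linux/darwin are handled because the plugin's
// glob only packages .so and .dylib files.
object NativeLibraryLoader {
  def load(libName: String): Unit = {
    val os = System.getProperty("os.name").toLowerCase
    val (kernel, ext) =
      if (os.contains("linux")) ("linux", "so")
      else if (os.contains("mac")) ("darwin", "dylib")
      else sys.error(s"Unhandled platform $os")
    val arch = System.getProperty("os.arch")
    val resourcePath = s"/native/$kernel/$arch/lib$libName.$ext"
    val in = getClass.getResourceAsStream(resourcePath)
    if (in == null) sys.error(s"Native library not found on classpath: $resourcePath")
    try {
      // System.load needs a real filesystem path, so spill to a temp file first
      val tmp = Files.createTempFile(libName, s".$ext")
      tmp.toFile.deleteOnExit()
      Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING)
      System.load(tmp.toAbsolutePath.toString)
    } finally in.close()
  }
}
```

Note that the JVM's `os.arch` (e.g. `amd64`) must match the directory name the plugin writes for host builds, which it does because both sides derive it from the running JVM.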
2 changes: 2 additions & 0 deletions project/build.sbt
@@ -0,0 +1,2 @@
// Used by RustPlugin to look at current OS info
libraryDependencies += "org.apache.commons" % "commons-lang3" % "3.14.0"
4 changes: 2 additions & 2 deletions project/plugins.sbt
@@ -1,8 +1,8 @@
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")

addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.7")
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.7")

addSbtPlugin("com.typesafe.sbt" % "sbt-multi-jvm" % "0.4.0")

5 changes: 3 additions & 2 deletions run_benchmarks.sh
@@ -1,10 +1,11 @@
#!/bin/bash
sbt "jmh/jmh:run -rf json -i 5 -wi 3 -f 1 -jvmArgsAppend -XX:MaxInlineLevel=20 \
sbt -Drust.optimize=true "jmh/jmh:run -rf json -i 5 -wi 3 -f 1 -jvmArgsAppend -XX:MaxInlineLevel=20 \
-jvmArgsAppend -Xmx4g -jvmArgsAppend -XX:MaxInlineSize=99 \
filodb.jmh.QueryHiCardInMemoryBenchmark \
filodb.jmh.QueryInMemoryBenchmark \
filodb.jmh.QueryAndIngestBenchmark \
filodb.jmh.IngestionBenchmark \
filodb.jmh.QueryOnDemandBenchmark \
filodb.jmh.GatewayBenchmark \
filodb.jmh.PartKeyIndexBenchmark"
filodb.jmh.PartKeyLuceneIndexBenchmark \
filodb.jmh.PartKeyTantivyIndexBenchmark"