diff --git a/src/Experimental/RAGTools/generation.jl b/src/Experimental/RAGTools/generation.jl index 50448b211..437ebe5c9 100644 --- a/src/Experimental/RAGTools/generation.jl +++ b/src/Experimental/RAGTools/generation.jl @@ -64,6 +64,17 @@ function build_context(contexter::ContextEnumerator, return context end +""" + build_context(contexter::ContextEnumerator, + index::AbstractManagedIndex, candidates::AbstractCandidateWithChunks; + verbose::Bool = true, + chunks_window_margin::Tuple{Int, Int} = (1, 1), kwargs...) + + build_context!(contexter::ContextEnumerator, + index::AbstractManagedIndex, result::AbstractRAGResult; kwargs...) + +Dispatch for `AbstractManagedIndex` with `AbstractCandidateWithChunks`. +""" function build_context(contexter::ContextEnumerator, index::AbstractManagedIndex, candidates::AbstractCandidateWithChunks; @@ -124,7 +135,6 @@ function answer!( throw(ArgumentError("Answerer $(typeof(answerer)) not implemented")) end -# TODO: update docs signature """ answer!( answerer::SimpleAnswerer, index::AbstractDocumentIndex, result::AbstractRAGResult; @@ -173,6 +183,17 @@ function answer!( return result end + +""" + answer!( + answerer::SimpleAnswerer, index::AbstractManagedIndex, result::AbstractRAGResult; + model::AbstractString = PT.MODEL_CHAT, verbose::Bool = true, + template::Symbol = :RAGAnswerFromContext, + cost_tracker = Threads.Atomic{Float64}(0.0), + kwargs...) + +Dispatch for `AbstractManagedIndex`. 
+""" function answer!( answerer::SimpleAnswerer, index::AbstractManagedIndex, result::AbstractRAGResult; model::AbstractString = PT.MODEL_CHAT, verbose::Bool = true, @@ -228,7 +249,6 @@ function refine!( end -# TODO: update docs signature """ refine!( refiner::NoRefiner, index::AbstractChunkIndex, result::AbstractRAGResult; @@ -247,10 +267,9 @@ function refine!( end -# TODO: update docs signature """ refine!( - refiner::SimpleRefiner, index::AbstractDocumentIndex, result::AbstractRAGResult; + refiner::SimpleRefiner, index::Union{AbstractDocumentIndex, AbstractManagedIndex}, result::AbstractRAGResult; verbose::Bool = true, model::AbstractString = PT.MODEL_CHAT, template::Symbol = :RAGAnswerRefiner, @@ -303,10 +322,9 @@ function refine!( end -# TODO: update docs signature """ refine!( - refiner::TavilySearchRefiner, index::AbstractDocumentIndex, result::AbstractRAGResult; + refiner::TavilySearchRefiner, index::Union{AbstractDocumentIndex, AbstractManagedIndex}, result::AbstractRAGResult; verbose::Bool = true, model::AbstractString = PT.MODEL_CHAT, include_answer::Bool = true, @@ -458,10 +476,9 @@ It uses `ContextEnumerator`, `SimpleAnswerer`, `SimpleRefiner`, and `NoPostproce postprocessor::AbstractPostprocessor = NoPostprocessor() end -# TODO: update docs signature """ generate!( - generator::AbstractGenerator, index::AbstractDocumentIndex, result::AbstractRAGResult; + generator::AbstractGenerator, index::Union{AbstractDocumentIndex, AbstractManagedIndex}, result::AbstractRAGResult; verbose::Integer = 1, api_kwargs::NamedTuple = NamedTuple(), contexter::AbstractContextBuilder = generator.contexter, @@ -591,8 +608,9 @@ function Base.show(io::IO, cfg::AbstractRAGConfig) dump(io, cfg; maxdepth = 2) end +# TODO: add example for Pinecone """ - airag(cfg::AbstractRAGConfig, index::AbstractDocumentIndex; + airag(cfg::AbstractRAGConfig, index::Union{AbstractDocumentIndex, AbstractManagedIndex}; question::AbstractString, verbose::Integer = 1, return_all::Bool = false, 
api_kwargs::NamedTuple = NamedTuple(), diff --git a/src/Experimental/RAGTools/preparation.jl b/src/Experimental/RAGTools/preparation.jl index 187f9495f..2f18e7b47 100644 --- a/src/Experimental/RAGTools/preparation.jl +++ b/src/Experimental/RAGTools/preparation.jl @@ -145,9 +145,12 @@ end PineconeIndexer <: AbstractIndexBuilder Pinecone index to be returned by `build_index`. + +It uses `FileChunker`, `SimpleEmbedder` and `NoTagger` as default chunker, embedder and tagger. """ @kwdef mutable struct PineconeIndexer <: AbstractIndexBuilder chunker::AbstractChunker = FileChunker() + # TODO: BatchEmbedder? embedder::AbstractEmbedder = SimpleEmbedder() tagger::AbstractTagger = NoTagger() end @@ -726,18 +729,94 @@ function build_index( return index end +# TODO: where to put these? using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3, init_v3, Index, PineconeVector, upsert using UUIDs: UUIDs, uuid4 -# TODO: change docs """ build_index( - indexer::PineconeIndexer; - namespace::AbstractString, + indexer::PineconeIndexer, files_or_docs::Vector{<:AbstractString}; + metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}(), + pinecone_context::Pinecone.PineconeContextv3 = Pinecone.init_v3(""), + pinecone_index::Pinecone.PineconeIndexv3 = nothing, + pinecone_namespace::AbstractString = "", + upsert::Bool = true, verbose::Integer = 1, - index_id = gensym("PTPineconeIndex"), + index_id = gensym(pinecone_namespace), + chunker::AbstractChunker = indexer.chunker, + chunker_kwargs::NamedTuple = NamedTuple(), + embedder::AbstractEmbedder = indexer.embedder, + embedder_kwargs::NamedTuple = NamedTuple(), + tagger::AbstractTagger = indexer.tagger, + tagger_kwargs::NamedTuple = NamedTuple(), + api_kwargs::NamedTuple = NamedTuple(), cost_tracker = Threads.Atomic{Float64}(0.0)) Builds a `PineconeIndex` containing a Pinecone context (API key, index and namespace). +The index stores the document chunks and their embeddings (and potentially other information). 
+ +The function processes each file or document (depending on `chunker`), splits its content into chunks, embeds these chunks +and then combines this information into a retrievable index. The chunks and embeddings are upserted to Pinecone using +the provided Pinecone context (unless the `upsert` flag is set to `false`). + +# Arguments +- `indexer::PineconeIndexer`: The indexing logic for Pinecone operations. +- `files_or_docs`: A vector of valid file paths to be indexed (chunked and embedded). +- `metadata::Vector{Dict{String, Any}}`: A vector of metadata attributed to each docs file, given as dictionaries with `String` keys. Default is empty vector. +- `pinecone_context::Pinecone.PineconeContextv3`: The Pinecone API key generated using Pinecone.jl. Must be specified. +- `pinecone_index::Pinecone.PineconeIndexv3`: The Pinecone index generated using Pinecone.jl. Must be specified. +- `pinecone_namespace::AbstractString`: The Pinecone namespace associated to `pinecone_index`. +- `upsert::Bool = true`: A flag specifying whether to upsert the chunks and embeddings to Pinecone. Defaults to `true`. +- `verbose`: An Integer specifying the verbosity of the logs. Default is `1` (high-level logging). `0` is disabled. +- `index_id`: A unique identifier for the index. Default is a generated symbol. +- `chunker`: The chunker logic to use for splitting the documents. Default is `indexer.chunker` (`FileChunker()` unless overridden). +- `chunker_kwargs`: Parameters to be provided to the `get_chunks` function. Useful to change the `separators` or `max_length`. + - `sources`: A vector of strings indicating the source of each chunk. Default is equal to `files_or_docs`. +- `embedder`: The embedder logic to use for embedding the chunks. Default is `indexer.embedder` (`SimpleEmbedder()` unless overridden). +- `embedder_kwargs`: Parameters to be provided to the `get_embeddings` function. Useful to change the `target_batch_size_length` or reduce asyncmap tasks `ntasks`. + - `model`: The model to use for embedding. Default is `PT.MODEL_EMBEDDING`. 
+- `tagger`: The tagger logic to use for extracting tags from the chunks. Default is `NoTagger()`, ie, skip tag extraction. There are also `PassthroughTagger` and `OpenTagger`. +- `tagger_kwargs`: Parameters to be provided to the `get_tags` function. + - `model`: The model to use for tags extraction. Default is `PT.MODEL_CHAT`. + - `template`: A template to be used for tags extraction. Default is `:RAGExtractMetadataShort`. + - `tags`: A vector of vectors of strings directly providing the tags for each chunk. Applicable for `tagger::PassthroughTagger`. +- `api_kwargs`: Parameters to be provided to the API endpoint. Shared across all API calls if provided. +- `cost_tracker`: A `Threads.Atomic{Float64}` object to track the total cost of the API calls. Useful to pass the total cost to the parent call. + +# Returns +- `PineconeIndex`: An object containing the compiled index of chunks, embeddings, tags, vocabulary, sources and metadata, together with the Pinecone connection data. + +See also: `PineconeIndex`, `get_chunks`, `get_embeddings`, `get_tags`, `CandidateWithChunks`, `find_closest`, `find_tags`, `rerank`, `retrieve`, `generate!`, `airag` + +# Examples +```julia +using Pinecone + +# Prepare the Pinecone connection data +pinecone_context = Pinecone.init_v3(ENV["PINECONE_API_KEY"]) +pindex = ENV["PINECONE_INDEX"] +pinecone_index = !isempty(pindex) ? Pinecone.Index(pinecone_context, pindex) : nothing +namespace = "my-namespace" + +# Add metadata about the sources in Pinecone +metadata = [Dict{String, Any}("source" => doc_file) for doc_file in docs_files] + +# Build the index. By default, the chunks and embeddings get upserted to Pinecone. 
+const RT = PromptingTools.Experimental.RAGTools +index_pinecone = RT.build_index( + RT.PineconeIndexer(), + docs_files; + pinecone_context = pinecone_context, + pinecone_index = pinecone_index, + pinecone_namespace = namespace, + metadata = metadata +) +``` +# Notes +- If you get errors about exceeding embedding input sizes, first check the `max_length` in your chunks. + If that does NOT resolve the issue, try changing the `embedder_kwargs`. + In particular, reducing the `target_batch_size_length` parameter (eg, 10_000) and number of tasks `ntasks=1`. + Some providers cannot handle large batch sizes (eg, Databricks). + """ function build_index( indexer::PineconeIndexer, files_or_docs::Vector{<:AbstractString}; @@ -745,7 +824,7 @@ function build_index( pinecone_context::Pinecone.PineconeContextv3 = Pinecone.init_v3(""), pinecone_index::Pinecone.PineconeIndexv3 = nothing, pinecone_namespace::AbstractString = "", - upsert::Bool = false, + upsert::Bool = true, verbose::Integer = 1, index_id = gensym(pinecone_namespace), chunker::AbstractChunker = indexer.chunker, @@ -756,7 +835,7 @@ function build_index( tagger_kwargs::NamedTuple = NamedTuple(), api_kwargs::NamedTuple = NamedTuple(), cost_tracker = Threads.Atomic{Float64}(0.0)) - @assert !isempty(pinecone_context.apikey) && !isnothing(pinecone_index) "Pinecone context and index not set" + @assert !isempty(pinecone_context.apikey) && !isnothing(pinecone_index) "Pinecone context and index not set" ## Split into chunks chunks, sources = get_chunks(chunker, files_or_docs; diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl index 673fdafba..5af1ad3a7 100644 --- a/src/Experimental/RAGTools/retrieval.jl +++ b/src/Experimental/RAGTools/retrieval.jl @@ -241,6 +241,37 @@ function find_closest( return CandidateChunks(indexid(index), positions, Float32.(scores)) end +# Dispatch to find scores for multiple embeddings +function find_closest( + finder::AbstractSimilarityFinder, 
index::AbstractChunkIndex, + query_emb::AbstractMatrix{<:Real}, query_tokens::AbstractVector{<:AbstractVector{<:AbstractString}} = Vector{Vector{String}}(); + top_k::Int = 100, kwargs...) + if isnothing(chunkdata(parent(index))) + return CandidateChunks(; index_id = indexid(index)) + end + ## reduce top_k since we have more than one query + top_k_ = top_k ÷ size(query_emb, 2) + ## simply vcat together (gets sorted from the highest similarity to the lowest) + if isempty(query_tokens) + mapreduce( + c -> find_closest(finder, index, c; top_k = top_k_, kwargs...), vcat, eachcol(query_emb)) + else + @assert length(query_tokens)==size(query_emb, 2) "Length of `query_tokens` must be equal to the number of columns in `query_emb`." + mapreduce( + (emb, tok) -> find_closest(finder, index, emb, tok; top_k = top_k_, kwargs...), vcat, eachcol(query_emb), query_tokens) + end +end + +""" + find_closest( + finder::AbstractSimilarityFinder, index::PineconeIndex, + query_emb::AbstractVector{<:Real}, query_tokens::AbstractVector{<:AbstractString} = String[]; + top_k::Int = 10, kwargs...) + +Finds the indices of chunks that are closest to query embedding (`query_emb`) by querying Pinecone. + +Returns only `top_k` closest indices. 
+""" function find_closest( finder::AbstractSimilarityFinder, index::PineconeIndex, query_emb::AbstractVector{<:Real}, query_tokens::AbstractVector{<:AbstractString} = String[]; @@ -261,6 +292,7 @@ function find_closest( scores = [m.score for m in matches] chunks = [m.metadata.content for m in matches] metadata = [JSON3.read(JSON3.write(m.metadata), Dict{String, Any}) for m in matches] + # TODO: metadata might not have `source`, change this sources = [m.metadata.source for m in matches] return CandidateWithChunks( @@ -272,6 +304,7 @@ function find_closest( sources = Vector{String}(sources)) end +# Dispatch to find scores for multiple embeddings function find_closest( finder::AbstractSimilarityFinder, index::PineconeIndex, query_emb::AbstractMatrix{<:Real}, query_tokens::AbstractVector{<:AbstractVector{<:AbstractString}} = Vector{Vector{String}}(); @@ -290,27 +323,6 @@ function find_closest( end end -# Dispatch to find scores for multiple embeddings -function find_closest( - finder::AbstractSimilarityFinder, index::AbstractChunkIndex, - query_emb::AbstractMatrix{<:Real}, query_tokens::AbstractVector{<:AbstractVector{<:AbstractString}} = Vector{Vector{String}}(); - top_k::Int = 100, kwargs...) - if isnothing(chunkdata(parent(index))) - return CandidateChunks(; index_id = indexid(index)) - end - ## reduce top_k since we have more than one query - top_k_ = top_k ÷ size(query_emb, 2) - ## simply vcat together (gets sorted from the highest similarity to the lowest) - if isempty(query_tokens) - mapreduce( - c -> find_closest(finder, index, c; top_k = top_k_, kwargs...), vcat, eachcol(query_emb)) - else - @assert length(query_tokens)==size(query_emb, 2) "Length of `query_tokens` must be equal to the number of columns in `query_emb`." 
- mapreduce( - (emb, tok) -> find_closest(finder, index, emb, tok; top_k = top_k_, kwargs...), vcat, eachcol(query_emb), query_tokens) - end -end - ### For MultiIndex function find_closest( finder::MultiFinder, index::AbstractMultiIndex, @@ -612,7 +624,7 @@ function find_tags(method::AllTagFilter, index::AbstractChunkIndex, end """ - find_tags(method::NoTagFilter, index::AbstractChunkIndex, + find_tags(method::NoTagFilter, index::Union{AbstractChunkIndex, AbstractManagedIndex}, tags::Union{T, AbstractVector{<:T}}; kwargs...) where {T <: Union{ AbstractString, Regex, Nothing}} @@ -620,12 +632,6 @@ end Returns all chunks in the index, ie, no filtering, so we simply return `nothing` (easier for dispatch). """ -# function find_tags(method::NoTagFilter, index::AbstractChunkIndex, -# tags::Union{T, AbstractVector{<:T}}; kwargs...) where {T <: -# Union{ -# AbstractString, Regex, Nothing}} -# return nothing -# end function find_tags( method::NoTagFilter, index::Union{AbstractChunkIndex, AbstractManagedIndex}, @@ -748,8 +754,6 @@ function rerank(reranker::NoReranker, candidates::AbstractCandidateWithChunks; top_n::Integer = length(candidates), kwargs...) - # Since this is almost a passthrough strategy, it returns the candidate_chunks unchanged - # but it truncates to `top_n` if necessary return first(candidates, top_n) end @@ -1017,11 +1021,22 @@ end PineconeRetriever <: AbstractRetriever Dispatch for `retrieve` for Pinecone. 
+ +# Fields +- `rephraser::AbstractRephraser`: the rephrasing method, dispatching `rephrase` - uses `NoRephraser` +- `embedder::AbstractEmbedder`: the embedding method, dispatching `get_embeddings` (see Preparation Stage for more details) - uses `SimpleEmbedder` +- `processor::AbstractProcessor`: the processor method, dispatching `get_keywords` (see Preparation Stage for more details) - uses `NoProcessor` +- `finder::AbstractSimilarityFinder`: the similarity search method, dispatching `find_closest` - uses `CosineSimilarity` +- `tagger::AbstractTagger`: the tag generating method, dispatching `get_tags` (see Preparation Stage for more details) - uses `NoTagger` +- `filter::AbstractTagFilter`: the tag matching method, dispatching `find_tags` - uses `NoTagFilter` +- `reranker::AbstractReranker`: the reranking method, dispatching `rerank` - uses `NoReranker` """ @kwdef mutable struct PineconeRetriever <: AbstractRetriever rephraser::AbstractRephraser = NoRephraser() + # TODO: BatchEmbedder? 
embedder::AbstractEmbedder = SimpleEmbedder() processor::AbstractProcessor = NoProcessor() + # TODO: actually do something with this; Pinecone allows choosing finder finder::AbstractSimilarityFinder = CosineSimilarity() tagger::AbstractTagger = NoTagger() filter::AbstractTagFilter = NoTagFilter() @@ -1242,6 +1257,33 @@ function retrieve(retriever::AbstractRetriever, return result end +""" + retrieve(retriever::PineconeRetriever, + index::PineconeIndex, + question::AbstractString; + verbose::Integer = 1, + top_k::Integer = 100, + top_n::Integer = 10, + api_kwargs::NamedTuple = NamedTuple(), + rephraser::AbstractRephraser = retriever.rephraser, + rephraser_kwargs::NamedTuple = NamedTuple(), + embedder::AbstractEmbedder = retriever.embedder, + embedder_kwargs::NamedTuple = NamedTuple(), + processor::AbstractProcessor = retriever.processor, + processor_kwargs::NamedTuple = NamedTuple(), + finder::AbstractSimilarityFinder = retriever.finder, + finder_kwargs::NamedTuple = NamedTuple(), + tagger::AbstractTagger = retriever.tagger, + tagger_kwargs::NamedTuple = NamedTuple(), + filter::AbstractTagFilter = retriever.filter, + filter_kwargs::NamedTuple = NamedTuple(), + reranker::AbstractReranker = retriever.reranker, + reranker_kwargs::NamedTuple = NamedTuple(), + cost_tracker = Threads.Atomic{Float64}(0.0), + kwargs...) + +Dispatch method for `PineconeIndex`. +""" function retrieve(retriever::PineconeRetriever, index::PineconeIndex, question::AbstractString; diff --git a/src/Experimental/RAGTools/types.jl b/src/Experimental/RAGTools/types.jl index 3301c1b67..1dcfcdc0b 100644 --- a/src/Experimental/RAGTools/types.jl +++ b/src/Experimental/RAGTools/types.jl @@ -136,17 +136,39 @@ chunkdata(index::ChunkEmbeddingsIndex) = embeddings(index) # For backward compatibility const ChunkIndex = ChunkEmbeddingsIndex +# TODO: where to put these? 
indexid(index::AbstractManagedIndex) = index.id chunks(index::AbstractManagedIndex) = index.chunks sources(index::AbstractManagedIndex) = index.sources +# TODO: what about this? using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3 + +""" + PineconeIndex + +Main struct for storing document chunks and their embeddings along with the necessary Pinecone context for connecting to Pinecone. + +# Fields +- `id::Symbol`: unique identifier of each index (a symbol of the Pinecone index namespace) +- `pinecone_context::Pinecone.PineconeContextv3`: Pinecone API key +- `pinecone_index::Pinecone.PineconeIndexv3`: Pinecone index +- `pinecone_namespace::String`: name of the namespace inside the Pinecone index +- `chunks::Vector{<:AbstractString}`: underlying document chunks / snippets +- `embeddings::Union{Nothing, Matrix{<:Real}}`: for semantic search +- `tags::Union{Nothing, AbstractMatrix{<:Bool}}`: for exact search, filtering, etc. This is often a sparse matrix indicating which chunks have the given `tag` (see `tags_vocab` for the position lookup) +- `tags_vocab::Union{Nothing, Vector{<:AbstractString}}`: vocabulary for the `tags` matrix (each column in `tags` is one item in `tags_vocab` and rows are the chunks) +- `sources::Union{Nothing, Vector{<:AbstractString}}`: sources of the chunks +- `metadata::Vector{Dict{String, Any}}`: metadata for each chunk/embedding stored in Pinecone +""" @kwdef struct PineconeIndex{ T1 <: Union{Nothing, AbstractString}, T2 <: Union{Nothing, Matrix{<:Real}}, T3 <: Union{Nothing, AbstractMatrix{<:Bool}} } <: AbstractManagedIndex + # TODO: id should be a combination of index + namespace? id::Symbol # namespace + # TODO: these should not be v3, maybe? 
pinecone_context::Pinecone.PineconeContextv3 pinecone_index::Pinecone.PineconeIndexv3 pinecone_namespace::String @@ -159,9 +181,10 @@ using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3 # column oriented, ie, each column is one item in `tags_vocab` and rows are the chunks tags::T3 = nothing tags_vocab::Union{Nothing, Vector{<:AbstractString}} = nothing + sources::Union{Nothing, Vector{<:AbstractString}} = nothing # metadata for each chunk + # TODO: should be changed to `extras`? but different type -- this needs to be vector of dicts metadata::Vector{Dict{String, Any}} = Vector{Dict{String, Any}}() - sources::Union{Nothing, Vector{<:AbstractString}} = nothing end HasKeywords(::PineconeIndex) = false HasEmbeddings(::PineconeIndex) = true @@ -549,6 +572,11 @@ Base.@propagate_inbounds function translate_positions_to_parent( end +""" + SubManagedIndex + +Provides the same functionality for `AbstractManagedIndex` as `SubChunkIndex` does for `AbstractChunkIndex`. +""" @kwdef struct SubManagedIndex{T <: AbstractManagedIndex} <: AbstractManagedIndex parent::T positions::Vector{Int} @@ -560,6 +588,7 @@ Base.parent(index::SubManagedIndex) = index.parent HasEmbeddings(index::SubManagedIndex) = HasEmbeddings(parent(index)) HasKeywords(index::SubManagedIndex) = HasKeywords(parent(index)) +# TODO: see which of these are needed Base.@propagate_inbounds function chunks(index::SubManagedIndex) view(chunks(parent(index)), positions(index)) end @@ -569,7 +598,6 @@ end Base.@propagate_inbounds function chunkdata(index::SubManagedIndex) chunkdata(parent(index), positions(index)) end -"Access chunkdata for a subset of chunks, `chunk_idx` is a vector of chunk indices in the index" Base.@propagate_inbounds function chunkdata( index::SubManagedIndex, chunk_idx::AbstractVector{<:Integer}) ## We need this accessor because different chunk indices can have chunks in different dimensions!! 
@@ -671,16 +699,31 @@ function CandidateChunks(index::AbstractChunkIndex, positions::AbstractVector{<: indexid(index), convert(Vector{Int}, positions), convert(Vector{Float32}, scores)) end + +""" + CandidateWithChunks + +Similar to `CandidateChunks`, but for `AbstractManagedIndex`. It's the result of the retrieval stage of RAG. + +# Fields +- `index_id::Symbol`: the id of the index from which the candidates are drawn +- `positions::Vector{Int}`: the positions of the candidates in the index (ie, `5` refers to the 5th chunk in the index - `chunks(index)[5]`) +- `scores::Vector{Float32}`: the similarity scores of the candidates from the query (higher is better) +- `chunks::Vector{String}`: the chunks retrieved for a given question +- `metadata::AbstractVector`: metadata corresponding to `chunks` +- `sources::Vector{String}`: sources corresponding to `chunks` +""" @kwdef struct CandidateWithChunks{TP <: Integer, TD <: Real} <: AbstractCandidateWithChunks index_id::Symbol positions::Vector{TP} = Int[] scores::Vector{TD} = Float32[] - ## fields that we don't have in Index anymore -- so we get them "per question" + ## fields obtained "per question" chunks::Vector{String} = String[] metadata::AbstractVector = Dict{String, Any}[] sources::Vector{String} = String[] end +# TODO: see which can be removed indexid(cc::CandidateWithChunks) = cc.index_id positions(cc::CandidateWithChunks) = cc.positions scores(cc::CandidateWithChunks) = cc.scores @@ -942,7 +985,6 @@ end Base.@propagate_inbounds function Base.view(index::SubChunkIndex, cc::MultiCandidateChunks) SubChunkIndex(index, cc) end -# TODO: proper `view` -- `SubManagedIndex`? Base.@propagate_inbounds function Base.view(index::AbstractManagedIndex, cc::CandidateWithChunks) @boundscheck let chk_vector = chunks(parent(index)) if !checkbounds(Bool, axes(chk_vector, 1), positions(cc))