From 27729fc36785c92e52ec507f832d6b70cec4db34 Mon Sep 17 00:00:00 2001 From: Iulia Dumitru Date: Thu, 1 Aug 2024 15:37:30 +0200 Subject: [PATCH] Integrate Pinecone on all levels --- .gitignore | 4 +- Project.toml | 3 + src/Experimental/RAGTools/RAGTools.jl | 4 +- src/Experimental/RAGTools/generation.jl | 24 ++++++ src/Experimental/RAGTools/preparation.jl | 55 ++++++++++++++ src/Experimental/RAGTools/rag_interface.jl | 7 ++ src/Experimental/RAGTools/retrieval.jl | 74 +++++++++++++++++++ src/Experimental/RAGTools/types.jl | 18 +++++ templates/persona-task/JuliaRAGAssistant.json | 23 ++++++ 9 files changed, 209 insertions(+), 3 deletions(-) create mode 100644 templates/persona-task/JuliaRAGAssistant.json diff --git a/.gitignore b/.gitignore index 729a3f342..85e2c05b8 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ **/.vscode # exclude scratch files -**/_* \ No newline at end of file +**/_* + +.env \ No newline at end of file diff --git a/Project.toml b/Project.toml index 5a35589f8..94e994b27 100644 --- a/Project.toml +++ b/Project.toml @@ -7,15 +7,18 @@ version = "0.44.0" AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" OpenAI = "e9f21f70-7185-4079-aca2-91159181367c" +Pinecone = "ee90fdae-f7f0-4648-8b00-9c0307cf46d9" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Preferences = "21216c6a-2e73-6563-6e65-726566657250" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [weakdeps] diff --git a/src/Experimental/RAGTools/RAGTools.jl b/src/Experimental/RAGTools/RAGTools.jl index fe4b80788..5f137793c 100644 --- a/src/Experimental/RAGTools/RAGTools.jl +++ b/src/Experimental/RAGTools/RAGTools.jl @@ -32,12 +32,12 @@ include("api_services.jl") include("rag_interface.jl") -export ChunkIndex, ChunkKeywordsIndex, ChunkEmbeddingsIndex, CandidateChunks, RAGResult +export ChunkIndex, ChunkKeywordsIndex, ChunkEmbeddingsIndex, PTPineconeIndex, CandidateChunks, RAGResult export MultiIndex, SubChunkIndex, MultiCandidateChunks include("types.jl") export build_index, get_chunks, get_embeddings, get_keywords, get_tags, SimpleIndexer, - KeywordsIndexer + KeywordsIndexer, PTPineconeIndexer include("preparation.jl") include("rank_gpt.jl") diff --git a/src/Experimental/RAGTools/generation.jl b/src/Experimental/RAGTools/generation.jl index fb187b980..070d7f81a 100644 --- a/src/Experimental/RAGTools/generation.jl +++ b/src/Experimental/RAGTools/generation.jl @@ -63,6 +63,30 @@ function build_context(contexter::ContextEnumerator, return context end +using Pinecone: Pinecone, query +using JSON3: JSON3, read +""" + build_context(contexter::ContextEnumerator, + index::AbstractPTPineconeIndex; + verbose::Bool = true, + top_k::Int = 10, + kwargs...) + +Build context strings by querying Pinecone. +``` +""" +function build_context(contexter::ContextEnumerator, + index::AbstractPTPineconeIndex; + verbose::Bool = true, + top_k::Int = 10, + kwargs...) + pinecone_results = Pinecone.query(index.pinecone_context, index.pinecone_index, index.embedding, top_k, index.namespace, false, true) + results_json = JSON3.read(pinecone_results) + context = results_json.matches[1].metadata.content + + return context +end + function build_context!(contexter::AbstractContextBuilder, index::AbstractDocumentIndex, result::AbstractRAGResult; kwargs...) throw(ArgumentError("Contexter $(typeof(contexter)) not implemented")) diff --git a/src/Experimental/RAGTools/preparation.jl b/src/Experimental/RAGTools/preparation.jl index d91fc6baa..649e259bd 100644 --- a/src/Experimental/RAGTools/preparation.jl +++ b/src/Experimental/RAGTools/preparation.jl @@ -20,6 +20,13 @@ Chunker when you provide text to `get_chunks` functions. Inputs are directly chu """ struct TextChunker <: AbstractChunker end +""" + NoChunker <: AbstractChunker + + +""" +struct NoChunker <: AbstractChunker end + ### Embedding Types """ NoEmbedder <: AbstractEmbedder @@ -134,6 +141,19 @@ It uses `TextChunker`, `KeywordsProcessor`, and `NoTagger` as default chunker, p tagger::AbstractTagger = NoTagger() end +""" + PTPineconeIndexer <: AbstractIndexBuilder + +Pinecone index to be returned by `build_index`. + +It uses `NoChunker`, `NoEmbedder`, and `NoTagger` as default chunker, embedder, and tagger. +""" +@kwdef mutable struct PTPineconeIndexer <: AbstractIndexBuilder + chunker::AbstractChunker = NoChunker() + embedder::AbstractEmbedder = NoEmbedder() + tagger::AbstractTagger = NoTagger() +end + ### Functions ## "Build an index for RAG (Retriever-Augmented Generation) applications. REQUIRES SparseArrays and LinearAlgebra packages to be loaded!!" @@ -166,6 +186,10 @@ function load_text(chunker::TextChunker, input::AbstractString; @assert length(source)<=512 "Each `source` should be less than 512 characters long. Detected: $(length(source)) characters. You must provide sources for each text when using `TextChunker`" return input, source end +function load_text(chunker::NoChunker, input::AbstractString = ""; + source::AbstractString = input, kwargs...) +return input, source +end """ get_chunks(chunker::AbstractChunker, @@ -705,6 +729,37 @@ function build_index( return index end +using Pinecone: Pinecone, init_v3, Index +""" + build_index( + indexer::PTPineconeIndexer; + namespace::AbstractString, + schema::AbstractPromptSchema = OpenAISchema(); + verbose::Integer = 1, + index_id = gensym("PTPineconeIndex"), + cost_tracker = Threads.Atomic{Float64}(0.0)) + +Builds a `PTPineconeIndex` containing a Pinecone context (API key, index and namespace). +""" +function build_index( + indexer::PTPineconeIndexer, + namespace::AbstractString, + schema::PromptingTools.AbstractPromptSchema = PromptingTools.OpenAISchema(); + verbose::Integer = 1, + index_id = gensym("PTPineconeIndex"), + cost_tracker = Threads.Atomic{Float64}(0.0)) + + pinecone_context = Pinecone.init_v3(ENV["PINECONE_API_KEY"]) + pindex = ENV["PINECONE_INDEX"] + pinecone_index = pinecone_index = !isempty(pindex) ? Pinecone.Index(pinecone_context, pindex) : nothing + + index = PTPineconeIndex(; id = index_id, pinecone_context, pinecone_index, namespace, schema) + + (verbose > 0) && @info "Index built! (cost: \$$(round(cost_tracker[], digits=3)))" + + return index +end + # Convenience for easy index creation """ ChunkKeywordsIndex( diff --git a/src/Experimental/RAGTools/rag_interface.jl b/src/Experimental/RAGTools/rag_interface.jl index 15c08fecc..f075392c2 100644 --- a/src/Experimental/RAGTools/rag_interface.jl +++ b/src/Experimental/RAGTools/rag_interface.jl @@ -161,6 +161,13 @@ Main abstract type for storing document chunks and their embeddings. It also sto """ abstract type AbstractChunkIndex <: AbstractDocumentIndex end +""" + AbstractPTPineconeIndex <: AbstractDocumentIndex + +Abstract type for working with Pinecone. For now, just an empty index. +""" +abstract type AbstractPTPineconeIndex <: AbstractDocumentIndex end + # ## Retrieval stage """ diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl index 11bedc57d..264fe063c 100644 --- a/src/Experimental/RAGTools/retrieval.jl +++ b/src/Experimental/RAGTools/retrieval.jl @@ -610,6 +610,13 @@ function find_tags(method::NoTagFilter, index::AbstractMultiIndex, return nothing end +function find_tags(method::NoTagFilter, index::AbstractPTPineconeIndex, + tags::Union{T, AbstractVector{<:T}}; kwargs...) where {T <: + Union{ + AbstractString, Regex, Nothing}} + return nothing +end + ### Reranking """ @@ -943,6 +950,21 @@ Compared to SimpleRetriever, it adds rephrasing the query and reranking the resu reranker::AbstractReranker = CohereReranker() end +""" + PTPineconeRetriever <: AbstractRetriever + +Dispatch for `retrieve` for Pinecone. +""" +@kwdef mutable struct PTPineconeRetriever <: AbstractRetriever + rephraser::AbstractRephraser = NoRephraser() + embedder::AbstractEmbedder = NoEmbedder() + processor::AbstractProcessor = NoProcessor() + finder::AbstractSimilarityFinder = CosineSimilarity() + tagger::AbstractTagger = NoTagger() + filter::AbstractTagFilter = NoTagFilter() + reranker::AbstractReranker = NoReranker() +end + """ retrieve(retriever::AbstractRetriever, index::AbstractChunkIndex, @@ -1157,6 +1179,58 @@ function retrieve(retriever::AbstractRetriever, return result end +function retrieve(retriever::PTPineconeRetriever, + index::AbstractPTPineconeIndex, + question::AbstractString; + verbose::Integer = 1, + top_k::Integer = 100, + top_n::Integer = 10, + api_kwargs::NamedTuple = NamedTuple(), + rephraser::AbstractRephraser = retriever.rephraser, + rephraser_kwargs::NamedTuple = NamedTuple(), + embedder::AbstractEmbedder = retriever.embedder, + embedder_kwargs::NamedTuple = NamedTuple(), + processor::AbstractProcessor = retriever.processor, + processor_kwargs::NamedTuple = NamedTuple(), + finder::AbstractSimilarityFinder = retriever.finder, + finder_kwargs::NamedTuple = NamedTuple(), + tagger::AbstractTagger = retriever.tagger, + tagger_kwargs::NamedTuple = NamedTuple(), + filter::AbstractTagFilter = retriever.filter, + filter_kwargs::NamedTuple = NamedTuple(), + reranker::AbstractReranker = retriever.reranker, + reranker_kwargs::NamedTuple = NamedTuple(), + cost_tracker = Threads.Atomic{Float64}(0.0), + kwargs...) + ## Rephrase into one or more questions + rephraser_kwargs_ = isempty(api_kwargs) ? rephraser_kwargs : + merge(rephraser_kwargs, (; api_kwargs)) + rephrased_questions = rephrase( + rephraser, question; verbose = (verbose > 1), cost_tracker, rephraser_kwargs_...) + + ## Embed the question + index.embedding = Vector{Float64}(aiembed(index.schema, question).content) + embeddings = hcat([Vector{Float64}(aiembed(index.schema, x).content) for x in rephrased_questions]...) + + ## Get the context from Pinecone + pinecone_results = Pinecone.query(index.pinecone_context, index.pinecone_index, index.embedding, top_n, index.namespace, false, true) + pinecone_results_json = JSON3.read(pinecone_results) + context = map(x -> x.metadata.content, pinecone_results_json.matches) + + verbose > 0 && + @info "Retrieval done. Total cost: \$$(round(cost_tracker[], digits=2))." + + ## Return + result = RAGResult(; + question, + answer = nothing, + rephrased_questions, + final_answer = nothing, + context) + + return result +end + # Set default behavior DEFAULT_RETRIEVER = SimpleRetriever() function retrieve(index::AbstractChunkIndex, question::AbstractString; diff --git a/src/Experimental/RAGTools/types.jl b/src/Experimental/RAGTools/types.jl index 9582a28bf..135e8fde7 100644 --- a/src/Experimental/RAGTools/types.jl +++ b/src/Experimental/RAGTools/types.jl @@ -134,6 +134,24 @@ chunkdata(index::ChunkEmbeddingsIndex) = embeddings(index) # For backward compatibility const ChunkIndex = ChunkEmbeddingsIndex + +using Pinecone: Pinecone, PineconeContextv3, PineconeIndexv3 +""" + PTPineconeIndex + +Struct for storing index for working with Pinecone. +""" +@kwdef mutable struct PTPineconeIndex <: AbstractPTPineconeIndex + id::Symbol = gensym("PTPineconeIndex") + pinecone_context::Pinecone.PineconeContextv3 + pinecone_index::Pinecone.PineconeIndexv3 + namespace::AbstractString + schema::PromptingTools.AbstractPromptSchema + embedding::Vector{Float64} = Float64[] +end +HasKeywords(::PTPineconeIndex) = false +HasEmbeddings(::PTPineconeIndex) = false + abstract type AbstractDocumentTermMatrix end """ diff --git a/templates/persona-task/JuliaRAGAssistant.json b/templates/persona-task/JuliaRAGAssistant.json new file mode 100644 index 000000000..7b3c272f5 --- /dev/null +++ b/templates/persona-task/JuliaRAGAssistant.json @@ -0,0 +1,23 @@ +[ + { + "content": "Template Metadata", + "description": "For asking questions for Julia in a RAG context. Placeholders: `question` and `context`", + "version": "1", + "source": "", + "_type": "metadatamessage" + }, + { + "content": "Act as a world-class Julia language programmer with access to the latest Julia-related knowledge via Context Information. \n\n**Instructions:**\n- Answer the question based only on the provided Context.\n- Be precise and answer only when you're confident in the high quality of your answer.\n- Be brief and concise.\n\n**Context Information:**\n---\n{{context}}\n---\n", + "variables": [ + "context" + ], + "_type": "systemmessage" + }, + { + "content": "# Question\n\n{{question}}\n\n\n\n# Answer\n\n", + "variables": [ + "question" + ], + "_type": "usermessage" + } + ] \ No newline at end of file