-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support pai-llm-evals and refactor eval exp & pipeline (#278)
* Refactor eval exp & pipeline * Add llm-evals package * Add example file * Fix * Fix multi-modal qca_dataset_path * Fix persist_path
- Loading branch information
Showing
25 changed files
with
787 additions
and
293 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 5 additions & 0 deletions
5
example_data/eval_docs_crag_small/crag-task1-straightforward-eval-msgs_1.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
5 changes: 5 additions & 0 deletions
5
example_data/eval_docs_crag_small/crag-task1-straightforward-eval-msgs_2.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
src/pai_rag/config/evaluation/settings_eval_for_crag_text.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# Evaluation settings for the CRAG text experiment.
# dynaconf_merge = true merges these values over the base configuration.
dynaconf_merge = true

[rag]
name = "pai_rag"
version = "0.1.1"

[rag.agent]
custom_agent_config_file = ""
agent_tool_type = ""

[rag.chat_store]
type = "Local" # [Local, Aliyun-Redis]
host = "Aliyun-Redis host"
password = "Aliyun-Redis user:pwd"
persist_path = "localdata/eval_exp_data/storage"

[rag.data_analysis]
type = "pandas"
nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "

[rag.data_reader]
type = "SimpleDirectoryReader"

# embedding configurations, source support API: OpenAI,DashScope; and local model:HuggingFace
# if use API, need set OPENAI_API_KEY or DASHSCOPE_API_KEY in ENV, If HuggingFace, need set model
# eg.
# source = "HuggingFace"
# model = "bge-large-zh-v1.5"
# embed_batch_size = 10
[rag.embedding]
source = "DashScope"
embed_batch_size = 10

[rag.index]
persist_path = "localdata/eval_exp_data/storage"
enable_multimodal = false
vector_store.type = "FAISS"

# llm configurations, source support API: OpenAI,DashScope or PAI-EAS's deployment
# eg.
# source = "PaiEas"
# model = ""
# endpoint = ""
# token = ""
[rag.llm]
source = "OpenAI"
model = "gpt-4o-2024-08-06"

[rag.multimodal_embedding]
source = "cnclip"

[rag.multimodal_llm]
source = "dashscope"
model = "qwen-vl-plus"

[rag.node_enhancement]
tree_depth = 3
max_clusters = 52
proba_threshold = 0.10

[rag.node_parser]
type = "Sentence"
chunk_size = 500
chunk_overlap = 10
enable_multimodal = false

[rag.oss_store]
bucket = ""
endpoint = "oss-cn-hangzhou.aliyuncs.com"

[rag.postprocessor]
reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker]
reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
keyword_weight = 0.3
vector_weight = 0.7
similarity_threshold = 0.5
top_n = 2

[rag.query_transform]
type = ""

[rag.retriever]
similarity_top_k = 5
retrieval_mode = "hybrid" # [hybrid, embedding, keyword, router]
query_rewrite_n = 1 # set to 1 to disable query generation

[rag.search]
search_api_key = ""

[rag.synthesizer]
type = "SimpleSummarize"
text_qa_template = "You are a helpful assistant.\nYou are given a Question and References. The references may or may not help answer the question. Your task is to answer the question in as few words as possible.\nPlease follow these guidelines when formulating your answer:\n1. If the question contains a false premise or assumption, answer “invalid question”.\n2. If you are uncertain or don’t know the answer, respond with “I don’t know”.\n\n### Question \n{query_str} \n\n### References:\n {context_str} \n\n### Answer:\n"

[rag.trace]
type = "arize_phoenix"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from typing import Any, Optional

from llama_index.core.indices import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from loguru import logger

from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
|
||
|
||
class CragDataLoader:
    """Load CRAG evaluation documents into a vector index.

    Reads documents with a ``PaiDataReader``, runs them through an
    ``IngestionPipeline`` whose only transformation is the embedding
    model, and inserts the resulting nodes into the given
    ``VectorStoreIndex``.
    """

    def __init__(
        self,
        data_reader: PaiDataReader,
        embed_model: Any = None,
        vector_index: VectorStoreIndex = None,
    ):
        """Store the reader, embedding model, and target index.

        NOTE(review): although ``vector_index`` defaults to None,
        ``load_data`` calls ``insert_nodes`` on it unconditionally, so a
        real index must be supplied for ingestion to succeed.
        """
        self._data_reader = data_reader
        self._embed_model = embed_model
        self._vector_index = vector_index

    def load_data(
        self,
        file_path_or_directory: str,
        from_oss: bool = False,
        oss_path: Optional[str] = None,  # was annotated `str`; None is the default
        filter_pattern: Optional[str] = None,  # was annotated `str`; None is the default
        enable_raptor: bool = False,  # was annotated `str`; it is a flag
    ):
        """Load data from a file or directory and ingest it into the index.

        Args:
            file_path_or_directory: Local file or directory to read.
            from_oss: If True, read from OSS at ``oss_path`` instead of the
                local path.
            oss_path: OSS location to read when ``from_oss`` is True.
            filter_pattern: Optional filename filter forwarded to the reader.
            enable_raptor: Currently unused here; kept for interface
                compatibility with other loaders.
        """
        documents = self._data_reader.load_data(
            file_path_or_directory=file_path_or_directory,
            filter_pattern=filter_pattern,
            oss_path=oss_path,
            from_oss=from_oss,
        )
        if from_oss:
            logger.info(f"Loaded {len(documents)} documents from {oss_path}")
        else:
            logger.info(
                f"Loaded {len(documents)} documents from {file_path_or_directory}"
            )

        # The embedding model is the only transformation applied.
        transformations = [
            self._embed_model,
        ]

        ingestion_pipeline = IngestionPipeline(transformations=transformations)

        nodes = ingestion_pipeline.run(documents=documents)
        logger.info(
            f"[DataLoader] parsed {len(documents)} documents into {len(nodes)} nodes."
        )

        self._vector_index.insert_nodes(nodes)
        logger.info(f"[DataLoader] Inserted {len(nodes)} nodes.")
        logger.info("[DataLoader] Ingestion Completed!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
"""CRAG JSONL reader.

Contains a reader that converts CRAG-format JSONL files into Documents.
"""
|
||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
from fsspec import AbstractFileSystem | ||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
import json | ||
|
||
|
||
class CragJsonLReader(BaseReader):
    """Reader for CRAG-format JSONL files.

    Each input line is a JSON object carrying an ``interaction_id`` and a
    list of ``search_results``; one Document is produced per search result,
    with the result's ``page_snippet`` as the document text.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)

    def load_data(
        self,
        file_path: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Parse ``file_path`` and return one Document per search result.

        Args:
            file_path: Path to the JSONL file.
            extra_info: Optional base metadata merged into every document's
                metadata; the caller's dict is not mutated.
            fs: Unused; accepted for BaseReader interface compatibility.

        Returns:
            Documents with ids ``"<interaction_id>__<j>"`` and metadata
            including ``row_number`` (1-based line number) and
            ``dataset_source`` = ``"crag"``.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            json_lines = [line.strip() for line in file]

        docs = []
        for i, text in enumerate(json_lines):
            if not text:
                # Skip blank lines instead of failing on json.loads("").
                continue
            json_data = json.loads(text)
            for j, search_result in enumerate(json_data["search_results"]):
                # BUG FIX: the original mutated `extra_info` in place and
                # passed the SAME dict as metadata for every Document (so all
                # docs shared one metadata object holding the last row_number),
                # and crashed with TypeError when extra_info was None (its
                # declared default). Build a fresh dict per document instead.
                metadata = dict(extra_info or {})
                metadata["row_number"] = i + 1
                metadata["dataset_source"] = "crag"
                docs.append(
                    Document(
                        doc_id=f"{json_data['interaction_id']}__{j}",
                        text=search_result["page_snippet"],
                        metadata=metadata,
                    )
                )
        return docs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.