Skip to content

Commit

Permalink
Support multi evaluators and experiments pipeline (#247)
Browse files Browse the repository at this point in the history
* Add evaluator and metrics

* Add evaluator and metrics

* Add eval experiment pipeline

* Modify entry file

* Modify entry file

* Modify result file

* Fix

* Refactor evaluation
  • Loading branch information
wwxxzz authored Oct 25, 2024
1 parent 68842d4 commit 12a2862
Show file tree
Hide file tree
Showing 23 changed files with 1,110 additions and 376 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ docx = "^0.2.4"
pai_rag = "pai_rag.main:main"
load_data = "pai_rag.tool.load_data_tool:run"
load_model = "pai_rag.utils.download_models:load_models"
evaluation = "pai_rag.evaluation.eval_pipeline:run"
run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"

[[tool.poetry.source]]
name = "pytorch_cpu"
Expand Down
2 changes: 1 addition & 1 deletion pyproject_gpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ peft = "^0.12.0"
pai_rag = "pai_rag.main:main"
load_data = "pai_rag.tool.load_data_tool:run"
load_model = "pai_rag.utils.download_models:load_models"
evaluation = "pai_rag.evaluation.eval_pipeline:run"
run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"

[tool.pytest.ini_options]
asyncio_mode = "auto"
2 changes: 1 addition & 1 deletion src/pai_rag/app/web/ui_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
MLLM_MODEL_KEY_DICT = {
"dashscope": [
"qwen-vl-max",
"qwen-vl-turbo",
"qwen-vl-plus",
]
}

Expand Down
8 changes: 8 additions & 0 deletions src/pai_rag/config/evaluation/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
experiment:
# [custom knowledge dataset]
- name: "exp1"
data_path: "example_data/eval_docs"
setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
- name: "exp2"
data_path: "example_data/eval_docs_1"
setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,8 @@ name = "pai_rag"
version = "0.1.1"

[rag.agent]
type = "react"

[rag.agent.custom_config]
agent_file_path = ""

[rag.agent.intent_detection]
type = ""

[rag.agent.tool]
type = ""
custom_agent_config_file = ""
agent_tool_type = ""

[rag.chat_store]
type = "Local" # [Local, Aliyun-Redis]
Expand All @@ -23,12 +15,9 @@ password = "Aliyun-Redis user:pwd"
persist_path = "localdata/eval_exp_data/storage"

[rag.data_analysis]
analysis_type = "nl2pandas"
type = "pandas"
nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "

[rag.data_loader]
type = "local"

[rag.data_reader]
type = "SimpleDirectoryReader"

Expand All @@ -42,9 +31,6 @@ type = "SimpleDirectoryReader"
source = "DashScope"
embed_batch_size = 10

[rag.embedding.multi_modal]
source = "cnclip"

[rag.index]
persist_path = "localdata/eval_exp_data/storage"
enable_multimodal = true
Expand All @@ -60,12 +46,11 @@ vector_store.type = "FAISS"
source = "DashScope"
model = "qwen-turbo"

[rag.llm.function_calling_llm]
source = ""
[rag.multimodal_embedding]
source = "cnclip"

[rag.llm.multi_modal]
enable = true
source = "DashScope"
[rag.multimodal_llm]
source = "dashscope"
model = "qwen-vl-plus"

[rag.node_enhancement]
Expand All @@ -81,20 +66,16 @@ enable_multimodal = true

[rag.oss_store]
bucket = ""
endpoint = ""
prefix = ""
endpoint = "oss-cn-hangzhou.aliyuncs.com"

[rag.postprocessor]
reranker_type = "simple-weighted-reranker" # [simple-weighted-reranker, model-based-reranker]
reranker_type = "no-reranker" # [no-reranker, simple-weighted-reranker, model-based-reranker]
reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
keyword_weight = 0.3
vector_weight = 0.7
similarity_threshold = 0.5
top_n = 2

[rag.query_engine]
type = "RetrieverQueryEngine"

[rag.query_transform]
type = ""

Expand All @@ -111,6 +92,6 @@ type = "SimpleSummarize"
text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n"

[rag.trace]
type = "pai-llm-trace"
type = "pai_trace"
endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090"
token = ""
111 changes: 111 additions & 0 deletions src/pai_rag/evaluation/dataset/rag_eval_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from typing import List, Optional, Type, Dict
from llama_index.core.bridge.pydantic import Field
import json
from llama_index.core.bridge.pydantic import BaseModel
from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample


class EvaluationSample(RagQcaSample):
    """RAG sample extended with per-sample evaluation metrics.

    On top of the fields inherited from ``RagQcaSample`` this adds two
    retrieval metrics (``hitrate``, ``mrr``) and two response metrics
    (faithfulness and correctness), each of the latter carrying a score and
    a textual reason. Every metric is optional and defaults to ``None``.
    """

    # NOTE: the fields use ``default=None``. ``default_factory`` expects a
    # zero-argument callable, so passing ``None`` there declares no default
    # at all and leaves it to the pydantic version in use to decide whether
    # the field is required.
    hitrate: Optional[float] = Field(
        default=None,
        description="The hitrate value for retrieval evaluation.",
    )
    mrr: Optional[float] = Field(
        default=None,
        description="The mrr value for retrieval evaluation.",
    )

    faithfulness_score: Optional[float] = Field(
        default=None,
        description="The faithfulness score for response evaluation.",
    )

    faithfulness_reason: Optional[str] = Field(
        default=None,
        description="The faithfulness reason for response evaluation.",
    )

    correctness_score: Optional[float] = Field(
        default=None,
        description="The correctness score for response evaluation.",
    )

    correctness_reason: Optional[str] = Field(
        default=None,
        description="The correctness reason for response evaluation.",
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "EvaluationSample"


class PaiRagEvalDataset(BaseModel):
    """Evaluated samples together with their aggregated metric results.

    Fields:
        examples: The evaluated samples.
        results: Aggregated metrics, stored under the ``"retrieval"`` and
            ``"response"`` keys by ``cal_mean_metric_score``.
        status: Per-stage flags (``"retrieval"`` / ``"response"``); a truthy
            flag makes ``cal_mean_metric_score`` compute that stage's means.
    """

    _example_type: Type[EvaluationSample] = EvaluationSample  # type: ignore[misc]
    examples: List[EvaluationSample] = Field(
        default_factory=list, description="Data examples of this dataset."
    )
    results: Dict[str, Dict[str, float]] = Field(
        default_factory=dict, description="Evaluation result of this dataset."
    )
    status: Dict[str, bool] = Field(
        default_factory=dict, description="Status of this dataset."
    )

    @property
    def class_name(self) -> str:
        """Class name."""
        return "PaiRagEvalDataset"

    def cal_mean_metric_score(self) -> None:
        """Compute mean metric scores and store them in ``self.results``.

        ``results["retrieval"]`` and ``results["response"]`` are always reset
        to empty dicts first. A stage's means are then filled in only when
        its ``status`` flag is truthy and there is at least one example; a
        missing flag is treated as ``False``.

        Raises:
            TypeError: If a stage is flagged as done but one of its metric
                values is still ``None`` (``float(None)``).
        """
        self.results["retrieval"] = {}
        self.results["response"] = {}

        total = len(self.examples)
        if total == 0:
            # Nothing to average; avoids dividing by zero below.
            return

        def mean_of(attr: str) -> float:
            return sum(float(getattr(entry, attr)) for entry in self.examples) / total

        # ``.get`` so that a dataset with the default (empty) status does not
        # raise ``KeyError``.
        if self.status.get("retrieval"):
            self.results["retrieval"] = {
                "mean_hitrate": mean_of("hitrate"),
                "mean_mrr": mean_of("mrr"),
            }
        if self.status.get("response"):
            self.results["response"] = {
                "mean_faithfulness_score": mean_of("faithfulness_score"),
                "mean_correctness_score": mean_of("correctness_score"),
            }

    def save_json(self, path: str) -> None:
        """Recompute the mean metrics and write the dataset to ``path`` as JSON.

        The file is written as UTF-8 with non-ASCII characters kept verbatim.
        """
        self.cal_mean_metric_score()

        # Build the payload before opening the file so that a serialisation
        # error does not leave a truncated file behind.
        data = {
            "examples": [example.dict() for example in self.examples],
            "results": self.results,
            "status": self.status,
        }

        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"Saved dataset to {path}.")

    @classmethod
    def from_json(cls, path: str) -> "PaiRagEvalDataset":
        """Load a dataset from a JSON file produced by ``save_json``.

        Raises:
            KeyError: If ``examples``, ``results`` or ``status`` is missing.
        """
        # ``save_json`` writes UTF-8 with ``ensure_ascii=False``; read it back
        # with the same encoding instead of the platform default.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
        results = data["results"]
        status = data["status"]

        return cls(examples=examples, results=results, status=status)
Original file line number Diff line number Diff line change
@@ -1,22 +1,15 @@
from typing import List, Optional
from typing import List, Optional, Type
from llama_index.core.bridge.pydantic import Field
from llama_index.core.llama_dataset.base import BaseLlamaDataExample
from llama_index.core.llama_dataset import CreatedBy
import json
from llama_index.core.bridge.pydantic import BaseModel


class LabelledRagQcaSample(BaseLlamaDataExample):
"""RAG example class. Analogous to traditional ML datasets, this dataset contains
class RagQcaSample(BaseLlamaDataExample):
"""Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
to evaluate the prediction.
Args:
query (str): The user query
query_by (CreatedBy): Query generated by human or ai (model-name)
reference_contexts (Optional[List[str]]): The contexts used for response
reference_node_id (Optional[List[str]]): The node id corresponding to the contexts
reference_answer ([str]): Reference answer to the query. An answer
that would receive full marks upon evaluation.
reference_answer_by: The reference answer generated by human or ai (model-name).
"""

query: str = Field(
Expand All @@ -40,18 +33,6 @@ class LabelledRagQcaSample(BaseLlamaDataExample):
default=None, description="What model generated the reference answer."
)

@property
def class_name(self) -> str:
"""Data example class name."""
return "LabelledRagQcaSample"


class PredictedRagQcaSample(LabelledRagQcaSample):
"""Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
to evaluate the prediction.
"""

predicted_contexts: Optional[List[str]] = Field(
default_factory=None,
description="The contexts used to generate the predicted answer.",
Expand All @@ -71,4 +52,50 @@ class PredictedRagQcaSample(LabelledRagQcaSample):
@property
def class_name(self) -> str:
"""Data example class name."""
return "PredictedRagQcaSample"
return "RagQcaSample"


class PaiRagQcaDataset(BaseModel):
    """Collection of ``RagQcaSample`` examples with labelling/prediction flags.

    Fields:
        examples: The samples of this dataset.
        labelled: Whether the dataset is labelled.
        predicted: Whether the dataset is predicted.
    """

    _example_type: Type[RagQcaSample] = RagQcaSample  # type: ignore[misc]
    examples: List[RagQcaSample] = Field(
        default_factory=list, description="Data examples of this dataset."
    )
    labelled: bool = Field(
        default=False, description="Whether the dataset is labelled or not."
    )
    predicted: bool = Field(
        default=False, description="Whether the dataset is predicted or not."
    )

    @property
    def class_name(self) -> str:
        """Class name."""
        return "PaiRagQcaDataset"

    def save_json(self, path: str) -> None:
        """Write the dataset to ``path`` as JSON.

        The file is written as UTF-8 with non-ASCII characters kept verbatim.
        """
        # Build the payload before opening the file so that a serialisation
        # error does not leave a truncated file behind.
        data = {
            "examples": [example.dict() for example in self.examples],
            "labelled": self.labelled,
            "predicted": self.predicted,
        }

        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"Saved PaiRagQcaDataset to {path}.")

    @classmethod
    def from_json(cls, path: str) -> Optional["PaiRagQcaDataset"]:
        """Load a dataset from a JSON file produced by ``save_json``.

        Returns:
            The loaded dataset, or ``None`` when the file contains an empty
            ``examples`` list (existing behaviour, kept for callers that
            check for ``None``).

        Raises:
            KeyError: If ``examples`` is missing, or if ``labelled`` /
                ``predicted`` is missing from a non-empty dataset.
        """
        # ``save_json`` writes UTF-8 with ``ensure_ascii=False``; read it back
        # with the same encoding instead of the platform default.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        if not data["examples"]:
            return None

        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
        return cls(
            examples=examples,
            labelled=data["labelled"],
            predicted=data["predicted"],
        )
Loading

0 comments on commit 12a2862

Please sign in to comment.