From 12a2862bf89309b6c90697775257ab8a4d569f24 Mon Sep 17 00:00:00 2001 From: wwxxzz Date: Fri, 25 Oct 2024 15:49:13 +0800 Subject: [PATCH] Support multi evaluators and experiments pipeline (#247) * Add evaluator and metrics * Add evaluator and metrics * Add eval experiment pipeline * Modify entry file * Modify entry file * Modify result file * Fix * Refactor evaluation --- pyproject.toml | 2 +- pyproject_gpu.toml | 2 +- src/pai_rag/app/web/ui_constants.py | 2 +- src/pai_rag/config/evaluation/config.yaml | 8 + .../evaluation/settings_eval.toml | 39 ++--- .../evaluation/dataset/rag_eval_dataset.py | 111 +++++++++++++ .../rag_qca_dataset.py} | 77 ++++++--- src/pai_rag/evaluation/eval_pipeline.py | 131 --------------- .../evaluation/evaluator/base_evaluator.py | 132 +++++++++++++++ .../generator/labelled_qca_generator.py | 99 ----------- .../generator/predicted_qca_generator.py | 77 --------- .../evaluation/generator/rag_qca_generator.py | 155 ++++++++++++++++++ .../evaluation/metrics/response/base.py | 73 +++++++++ .../metrics/response/correctness.py | 143 ++++++++++++++++ .../metrics/response/faithfulness.py | 116 +++++++++++++ .../evaluation/metrics/retrieval/core.py | 36 ++++ .../evaluation/metrics/retrieval/hitrate.py | 51 ++++++ .../evaluation/metrics/retrieval/mrr.py | 51 ++++++ .../evaluation/run_evaluation_experiments.py | 64 ++++++++ .../evaluation/run_evaluation_pipeline.py | 99 +++++++++++ src/pai_rag/tools/agent_tool.py | 8 +- src/pai_rag/tools/load_data_tool.py | 4 +- src/pai_rag/tools/query_tool.py | 6 +- 23 files changed, 1110 insertions(+), 376 deletions(-) create mode 100644 src/pai_rag/config/evaluation/config.yaml rename src/pai_rag/{ => config}/evaluation/settings_eval.toml (83%) create mode 100644 src/pai_rag/evaluation/dataset/rag_eval_dataset.py rename src/pai_rag/evaluation/{generator/rag_qca_sample.py => dataset/rag_qca_dataset.py} (54%) delete mode 100644 src/pai_rag/evaluation/eval_pipeline.py create mode 100644 src/pai_rag/evaluation/evaluator/base_evaluator.py delete mode 100644 src/pai_rag/evaluation/generator/labelled_qca_generator.py delete mode 100644 src/pai_rag/evaluation/generator/predicted_qca_generator.py create mode 100644 src/pai_rag/evaluation/generator/rag_qca_generator.py create mode 100644 src/pai_rag/evaluation/metrics/response/base.py create mode 100644 src/pai_rag/evaluation/metrics/response/correctness.py create mode 100644 src/pai_rag/evaluation/metrics/response/faithfulness.py create mode 100644 src/pai_rag/evaluation/metrics/retrieval/core.py create mode 100644 src/pai_rag/evaluation/metrics/retrieval/hitrate.py create mode 100644 src/pai_rag/evaluation/metrics/retrieval/mrr.py create mode 100644 src/pai_rag/evaluation/run_evaluation_experiments.py create mode 100644 src/pai_rag/evaluation/run_evaluation_pipeline.py diff --git a/pyproject.toml b/pyproject.toml index dcf9df88..42011711 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ docx = "^0.2.4" pai_rag = "pai_rag.main:main" load_data = "pai_rag.tool.load_data_tool:run" load_model = "pai_rag.utils.download_models:load_models" -evaluation = "pai_rag.evaluation.eval_pipeline:run" +run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run" [[tool.poetry.source]] name = "pytorch_cpu" diff --git a/pyproject_gpu.toml b/pyproject_gpu.toml index b96a93be..3d4a4cb4 100644 --- a/pyproject_gpu.toml +++ b/pyproject_gpu.toml @@ -100,7 +100,7 @@ peft = "^0.12.0" pai_rag = "pai_rag.main:main" load_data = "pai_rag.tool.load_data_tool:run" load_model = 
"pai_rag.utils.download_models:load_models" -evaluation = "pai_rag.evaluation.eval_pipeline:run" +run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run" [tool.pytest.ini_options] asyncio_mode = "auto" diff --git a/src/pai_rag/app/web/ui_constants.py b/src/pai_rag/app/web/ui_constants.py index 1d67db52..0bff82d1 100644 --- a/src/pai_rag/app/web/ui_constants.py +++ b/src/pai_rag/app/web/ui_constants.py @@ -112,7 +112,7 @@ MLLM_MODEL_KEY_DICT = { "dashscope": [ "qwen-vl-max", - "qwen-vl-turbo", + "qwen-vl-plus", ] } diff --git a/src/pai_rag/config/evaluation/config.yaml b/src/pai_rag/config/evaluation/config.yaml new file mode 100644 index 00000000..76823aa8 --- /dev/null +++ b/src/pai_rag/config/evaluation/config.yaml @@ -0,0 +1,8 @@ +experiment: + # [custom knowledge dataset] + - name: "exp1" + data_path: "example_data/eval_docs" + setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" + - name: "exp2" + data_path: "example_data/eval_docs_1" + setting_file: "src/pai_rag/config/evaluation/settings_eval.toml" diff --git a/src/pai_rag/evaluation/settings_eval.toml b/src/pai_rag/config/evaluation/settings_eval.toml similarity index 83% rename from src/pai_rag/evaluation/settings_eval.toml rename to src/pai_rag/config/evaluation/settings_eval.toml index e5b2114f..341c3949 100644 --- a/src/pai_rag/evaluation/settings_eval.toml +++ b/src/pai_rag/config/evaluation/settings_eval.toml @@ -5,16 +5,8 @@ name = "pai_rag" version = "0.1.1" [rag.agent] -type = "react" - -[rag.agent.custom_config] -agent_file_path = "" - -[rag.agent.intent_detection] -type = "" - -[rag.agent.tool] -type = "" +custom_agent_config_file = "" +agent_tool_type = "" [rag.chat_store] type = "Local" # [Local, Aliyun-Redis] @@ -23,12 +15,9 @@ password = "Aliyun-Redis user:pwd" persist_path = "localdata/eval_exp_data/storage" [rag.data_analysis] -analysis_type = "nl2pandas" +type = "pandas" nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: " -[rag.data_loader] -type = "local" - [rag.data_reader] type = "SimpleDirectoryReader" @@ -42,9 +31,6 @@ type = "SimpleDirectoryReader" source = "DashScope" embed_batch_size = 10 -[rag.embedding.multi_modal] -source = "cnclip" - [rag.index] persist_path = "localdata/eval_exp_data/storage" enable_multimodal = true @@ -60,12 +46,11 @@ vector_store.type = "FAISS" source = "DashScope" model = "qwen-turbo" -[rag.llm.function_calling_llm] -source = "" +[rag.multimodal_embedding] +source = "cnclip" -[rag.llm.multi_modal] -enable = true -source = "DashScope" +[rag.multimodal_llm] +source = "dashscope" model = "qwen-vl-plus" [rag.node_enhancement] @@ -81,20 +66,16 @@ enable_multimodal = true [rag.oss_store] bucket = "" -endpoint = "" -prefix = "" +endpoint = "oss-cn-hangzhou.aliyuncs.com" [rag.postprocessor] -reranker_type = "simple-weighted-reranker" # [simple-weighted-reranker, model-based-reranker] +reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker] reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large] keyword_weight = 0.3 vector_weight = 0.7 similarity_threshold = 0.5 top_n = 2 -[rag.query_engine] -type = "RetrieverQueryEngine" - [rag.query_transform] type = "" @@ -111,6 +92,6 @@ type = "SimpleSummarize" text_qa_template = 
"参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n" [rag.trace] -type = "pai-llm-trace" +type = "pai_trace" endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090" token = "" diff --git a/src/pai_rag/evaluation/dataset/rag_eval_dataset.py b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py new file mode 100644 index 00000000..c9555e37 --- /dev/null +++ b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py @@ -0,0 +1,111 @@ +from typing import List, Optional, Type, Dict +from llama_index.core.bridge.pydantic import Field +import json +from llama_index.core.bridge.pydantic import BaseModel +from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample + + +class EvaluationSample(RagQcaSample): + """Response Evaluation RAG example class.""" + + hitrate: Optional[float] = Field( + default_factory=None, + description="The hitrate value for retrieval evaluation.", + ) + mrr: Optional[float] = Field( + default_factory=None, + description="The mrr value for retrieval evaluation.", + ) + + faithfulness_score: Optional[float] = Field( + default_factory=None, + description="The faithfulness score for response evaluation.", + ) + + faithfulness_reason: Optional[str] = Field( + default_factory=None, + description="The faithfulness reason for response evaluation.", + ) + + correctness_score: Optional[float] = Field( + default_factory=None, + description="The correctness score for response evaluation.", + ) + + correctness_reason: Optional[str] = Field( + default_factory=None, + description="The correctness reason for response evaluation.", + ) + + @property + def class_name(self) -> str: + """Data example class name.""" + return "EvaluationSample" + + +class PaiRagEvalDataset(BaseModel): + _example_type: Type[EvaluationSample] = EvaluationSample # type: ignore[misc] + examples: List[EvaluationSample] = Field( + default=[], description="Data examples of this dataset." + ) + results: Dict[str, Dict[str, float]] = Field( + default_factory=dict, description="Evaluation result of this dataset." + ) + status: Dict[str, bool] = Field( + default_factory=dict, description="Status of this dataset." 
+ ) + + @property + def class_name(self) -> str: + """Class name.""" + return "PaiRagEvalDataset" + + def cal_mean_metric_score(self) -> float: + """Calculate the mean metric score.""" + self.results["retrieval"] = {} + self.results["response"] = {} + if self.status["retrieval"]: + self.results["retrieval"] = { + "mean_hitrate": sum(float(entry.hitrate) for entry in self.examples) + / len(self.examples), + "mean_mrr": sum(float(entry.mrr) for entry in self.examples) + / len(self.examples), + } + if self.status["response"]: + self.results["response"] = { + "mean_faithfulness_score": sum( + float(entry.faithfulness_score) for entry in self.examples + ) + / len(self.examples), + "mean_correctness_score": sum( + float(entry.correctness_score) for entry in self.examples + ) + / len(self.examples), + } + + def save_json(self, path: str) -> None: + """Save json.""" + self.cal_mean_metric_score() + + with open(path, "w", encoding="utf-8") as f: + examples = [self._example_type.dict(el) for el in self.examples] + data = { + "examples": examples, + "results": self.results, + "status": self.status, + } + + json.dump(data, f, indent=4, ensure_ascii=False) + print(f"Saved dataset to {path}.") + + @classmethod + def from_json(cls, path: str) -> "PaiRagEvalDataset": + """Load json.""" + with open(path) as f: + data = json.load(f) + + examples = [cls._example_type.parse_obj(el) for el in data["examples"]] + results = data["results"] + status = data["status"] + + return cls(examples=examples, results=results, status=status) diff --git a/src/pai_rag/evaluation/generator/rag_qca_sample.py b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py similarity index 54% rename from src/pai_rag/evaluation/generator/rag_qca_sample.py rename to src/pai_rag/evaluation/dataset/rag_qca_dataset.py index f0f37e9f..a4d0bd3d 100644 --- a/src/pai_rag/evaluation/generator/rag_qca_sample.py +++ b/src/pai_rag/evaluation/dataset/rag_qca_dataset.py @@ -1,22 +1,15 @@ -from typing import List, Optional +from typing import List, Optional, Type from llama_index.core.bridge.pydantic import Field from llama_index.core.llama_dataset.base import BaseLlamaDataExample from llama_index.core.llama_dataset import CreatedBy +import json +from llama_index.core.bridge.pydantic import BaseModel -class LabelledRagQcaSample(BaseLlamaDataExample): - """RAG example class. Analogous to traditional ML datasets, this dataset contains +class RagQcaSample(BaseLlamaDataExample): + """Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response) to evaluate the prediction. - - Args: - query (str): The user query - query_by (CreatedBy): Query generated by human or ai (model-name) - reference_contexts (Optional[List[str]]): The contexts used for response - reference_node_id (Optional[List[str]]): The node id corresponding to the contexts - reference_answer ([str]): Reference answer to the query. An answer - that would receive full marks upon evaluation. - reference_answer_by: The reference answer generated by human or ai (model-name). """ query: str = Field( @@ -40,18 +33,6 @@ class LabelledRagQcaSample(BaseLlamaDataExample): default=None, description="What model generated the reference answer." ) - @property - def class_name(self) -> str: - """Data example class name.""" - return "LabelledRagQcaSample" - - -class PredictedRagQcaSample(LabelledRagQcaSample): - """Predicted RAG example class. 
Analogous to traditional ML datasets, this dataset contains - the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response) - to evaluate the prediction. - """ - predicted_contexts: Optional[List[str]] = Field( default_factory=None, description="The contexts used to generate the predicted answer.", @@ -71,4 +52,50 @@ class PredictedRagQcaSample(LabelledRagQcaSample): @property def class_name(self) -> str: """Data example class name.""" - return "PredictedRagQcaSample" + return "RagQcaSample" + + +class PaiRagQcaDataset(BaseModel): + _example_type: Type[RagQcaSample] = RagQcaSample # type: ignore[misc] + examples: List[RagQcaSample] = Field( + default=[], description="Data examples of this dataset." + ) + labelled: bool = Field( + default=False, description="Whether the dataset is labelled or not." + ) + predicted: bool = Field( + default=False, description="Whether the dataset is predicted or not." + ) + + @property + def class_name(self) -> str: + """Class name.""" + return "PaiRagQcaDataset" + + def save_json(self, path: str) -> None: + """Save json.""" + with open(path, "w", encoding="utf-8") as f: + examples = [self._example_type.dict(el) for el in self.examples] + data = { + "examples": examples, + "labelled": self.labelled, + "predicted": self.predicted, + } + + json.dump(data, f, indent=4, ensure_ascii=False) + print(f"Saved PaiRagQcaDataset to {path}.") + + @classmethod + def from_json(cls, path: str) -> "PaiRagQcaDataset": + """Load json.""" + with open(path) as f: + data = json.load(f) + + if len(data["examples"]) > 0: + examples = [cls._example_type.parse_obj(el) for el in data["examples"]] + labelled = data["labelled"] + predicted = data["predicted"] + + return cls(examples=examples, labelled=labelled, predicted=predicted) + else: + return None diff --git a/src/pai_rag/evaluation/eval_pipeline.py b/src/pai_rag/evaluation/eval_pipeline.py deleted file mode 100644 index 19a76f44..00000000 --- a/src/pai_rag/evaluation/eval_pipeline.py +++ /dev/null @@ -1,131 +0,0 @@ -import click -import os -import asyncio -from pathlib import Path -from pai_rag.core.rag_config_manager import RagConfigManager -from pai_rag.core.rag_data_loader import RagDataLoader -from pai_rag.core.rag_module import ( - resolve, - resolve_data_loader, - resolve_llm, - resolve_vector_index, -) -from pai_rag.evaluation.generator.labelled_qca_generator import LabelledRagQcaGenerator -from pai_rag.evaluation.generator.predicted_qca_generator import ( - PredictedRagQcaGenerator, -) -from pai_rag.integrations.llms.pai.pai_multi_modal_llm import ( - PaiMultiModalLlm, -) -import logging - - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -_BASE_DIR = Path(__file__).parent.parent -DEFAULT_APPLICATION_CONFIG_FILE = os.path.join( - _BASE_DIR, "evaluation/settings_eval.toml" -) - - -def _create_data_loader(config_file, enable_raptor: bool = False) -> RagDataLoader: - config = RagConfigManager.from_file(config_file).get_value() - data_loader = resolve_data_loader(config) - vector_index = resolve_vector_index(config) - - return data_loader, vector_index - - -def _create_labelled_qca_generator(config_file, vector_index) -> None: - config = RagConfigManager.from_file(config_file).get_value() - llm = resolve_llm(config) - qca_generator = LabelledRagQcaGenerator( - llm=llm, vector_index=vector_index, persist_path=config.rag.index.persist_path - ) - return qca_generator - - -def _create_predicted_qca_generator(config_file, vector_index) -> None: - config = 
RagConfigManager.from_file(config_file).get_value() - # llm = resolve_llm(config) - multimodal_llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) - predicted_qca_generator = PredictedRagQcaGenerator( - llm=multimodal_llm, - vector_index=vector_index, - persist_path=config.rag.index.persist_path, - ) - return predicted_qca_generator - - -@click.command() -@click.option( - "-c", - "--config", - show_default=True, - help=f"Configuration file. Default: {DEFAULT_APPLICATION_CONFIG_FILE}", - default=DEFAULT_APPLICATION_CONFIG_FILE, -) -@click.option( - "-o", - "--oss_path", - type=str, - required=False, - default=None, - show_default=True, - help="oss path (file or directory) to ingest. Example: oss://rag-demo/testdata", -) -@click.option( - "-d", - "--data_path", - type=str, - required=False, - default=None, - show_default=True, - help="data path (file or directory) to ingest.", -) -@click.option( - "-p", - "--pattern", - required=False, - type=str, - default=None, - help="data pattern to ingest.", -) -@click.option( - "-r", - "--enable_raptor", - required=False, - is_flag=True, - show_default=True, - default=False, - help="use raptor for node enhancement.", -) -def run( - config=None, - oss_path=None, - data_path=None, - pattern=None, - enable_raptor=False, -): - assert (oss_path is not None) or ( - data_path is not None - ), "Must provide either local path or oss path." - assert (oss_path is None) or ( - data_path is None - ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." - - data_loader, vector_index = _create_data_loader(config, enable_raptor) - data_loader.load_data( - file_path_or_directory=data_path, - filter_pattern=pattern, - oss_path=oss_path, - from_oss=oss_path is not None, - enable_raptor=enable_raptor, - ) - qca_generator = _create_labelled_qca_generator(config, vector_index) - asyncio.run(qca_generator.agenerate_labelled_qca_dataset()) - - predicted_qca_generator = _create_predicted_qca_generator(config, vector_index) - asyncio.run(predicted_qca_generator.agenerate_predicted_qca_dataset()) diff --git a/src/pai_rag/evaluation/evaluator/base_evaluator.py b/src/pai_rag/evaluation/evaluator/base_evaluator.py new file mode 100644 index 00000000..b3ccb56b --- /dev/null +++ b/src/pai_rag/evaluation/evaluator/base_evaluator.py @@ -0,0 +1,132 @@ +import os +from pai_rag.evaluation.metrics.retrieval.hitrate import HitRate +from pai_rag.evaluation.metrics.retrieval.mrr import MRR +from pai_rag.evaluation.metrics.response.faithfulness import Faithfulness +from pai_rag.evaluation.metrics.response.correctness import Correctness + +from llama_index.core.async_utils import run_jobs +from pai_rag.evaluation.dataset.rag_eval_dataset import ( + EvaluationSample, + PaiRagEvalDataset, +) +from pai_rag.evaluation.dataset.rag_qca_dataset import PaiRagQcaDataset + + +class BaseEvaluator: + def __init__(self, llm, persist_path: str = None): + self._llm = llm + self.persist_path = persist_path + self.hitrate = HitRate() + self.mrr = MRR() + self.retrieval_evaluators = [self.hitrate, self.mrr] + self.faithfulness_evaluator = Faithfulness( + llm=self._llm, + ) + self.correctness_evaluator = Correctness( + llm=self._llm, + ) + self.response_evaluators = [ + self.faithfulness_evaluator, + self.correctness_evaluator, + ] + self.evaluation_dataset_path = os.path.join( + self.persist_path, "evaluation_dataset.json" + ) + self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json") + self._show_progress = True + self._workers = 2 + + def 
load_qca_dataset(self) -> None: + if os.path.exists(self.qca_dataset_path): + rag_qca_dataset = PaiRagQcaDataset.from_json(self.qca_dataset_path) + if rag_qca_dataset.labelled and rag_qca_dataset.predicted: + print( + f"Labelled QCA dataset already exists at {self.qca_dataset_path}." + ) + return rag_qca_dataset + else: + raise ValueError( + "The QCA dataset exists but is not labelled and predicted. " + "Please either label it or provide a new one." + ) + else: + print("No existing QCA dataset found. You can proceed to create a new one.") + return None + + def load_evaluation_dataset(self) -> None: + if os.path.exists(self.evaluation_dataset_path): + print( + f"A evaluation dataset already exists at {self.evaluation_dataset_path}." + ) + evaluation_dataset = PaiRagEvalDataset.from_json( + self.evaluation_dataset_path + ) + return evaluation_dataset + else: + print( + "No existing evaluation dataset found. You can proceed to create a new one." + ) + return None + + async def compute_retrieval_metrics(self, qca_sample): + retrieval_eval_example = EvaluationSample(**vars(qca_sample)) + reference_node_id = retrieval_eval_example.reference_node_id + predicted_node_id = retrieval_eval_example.predicted_node_id + for metric in self.retrieval_evaluators: + metric_score = metric.compute(reference_node_id, predicted_node_id) + setattr(retrieval_eval_example, metric.metric_name, metric_score) + + return retrieval_eval_example + + async def compute_response_metrics(self, qca_sample): + response_eval_example = EvaluationSample(**vars(qca_sample)) + query = response_eval_example.query + response = response_eval_example.reference_answer + contexts = response_eval_example.predicted_contexts + for metric in self.response_evaluators: + metric_result = await metric.aevaluate(query, response, contexts) + setattr( + response_eval_example, f"{metric.metric_name}_score", metric_result[0] + ) + setattr( + response_eval_example, f"{metric.metric_name}_reason", metric_result[1] + ) + + return response_eval_example + + async def aevaluation(self, stage): + """Run evaluation with qca dataset.""" + _status = {"retrieval": False, "response": False} + evaluation_dataset = self.load_evaluation_dataset() + qca_dataset = self.load_qca_dataset() + if evaluation_dataset: + print( + f"A evaluation dataset already exists with status: [[{evaluation_dataset.status}]]" + ) + _status = evaluation_dataset.status + if _status[stage]: + return evaluation_dataset.results[stage] + else: + qca_dataset = evaluation_dataset + if qca_dataset: + print(f"Starting to generate evaluation dataset for stage: [[{stage}]]...") + eval_tasks = [] + for qca in qca_dataset.examples: + if stage == "retrieval": + eval_tasks.append(self.compute_retrieval_metrics(qca)) + elif stage == "response": + eval_tasks.append(self.compute_response_metrics(qca)) + else: + raise ValueError(f"Invalid stage: {stage}") + eval_examples = await run_jobs( + eval_tasks, self._show_progress, self._workers + ) + _status[stage] = True + eval_dataset = PaiRagEvalDataset(examples=eval_examples, status=_status) + eval_dataset.save_json(self.evaluation_dataset_path) + return eval_dataset.results[stage] + else: + raise ValueError( + "No QCA dataset found. Please provide a QCA dataset or " + "generate one first." 
+ ) diff --git a/src/pai_rag/evaluation/generator/labelled_qca_generator.py b/src/pai_rag/evaluation/generator/labelled_qca_generator.py deleted file mode 100644 index 9fe9b024..00000000 --- a/src/pai_rag/evaluation/generator/labelled_qca_generator.py +++ /dev/null @@ -1,99 +0,0 @@ -from typing import List -from llama_index.core.indices import VectorStoreIndex -from pai_rag.utils.prompt_template import ( - DEFAULT_QUESTION_GENERATION_PROMPT, - DEFAULT_TEXT_QA_PROMPT_TMPL, - DEFAULT_QUESTION_GENERATION_QUERY, -) -from llama_index.core.base.response.schema import RESPONSE_TYPE -from llama_index.core.prompts.base import PromptTemplate -import re -from llama_index.core.async_utils import run_jobs -from pai_rag.evaluation.generator.rag_qca_sample import LabelledRagQcaSample -from llama_index.core.llama_dataset import ( - CreatedBy, - CreatedByType, - LabelledRagDataset, -) -import json -import os -import logging - -logger = logging.getLogger(__name__) - - -class LabelledRagQcaGenerator: - def __init__( - self, llm, vector_index: VectorStoreIndex = None, persist_path: str = None - ): - self._llm = llm - self._vector_index = vector_index._vector_index - self.question_gen_query = DEFAULT_QUESTION_GENERATION_QUERY.format( - num_questions_per_chunk=3 - ) - self.text_question_template = PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT) - self.text_question_answer_template = PromptTemplate(DEFAULT_TEXT_QA_PROMPT_TMPL) - model_name = self._llm.metadata.model_name - self.created_by = CreatedBy(type=CreatedByType.AI, model_name=model_name) - self.persist_path = persist_path - self._show_progress = True - self._workers = 2 - - def save_labelled_qca_dataset_json(self, qas: LabelledRagDataset) -> None: - """Save json.""" - labelled_qca_dataset_path = os.path.join( - self.persist_path, "labelled_qca_dataset.json" - ) - with open(labelled_qca_dataset_path, "w", encoding="utf-8") as f: - json.dump(qas.dict(), f, indent=4, ensure_ascii=False) - logger.info(f"Saved labelled qca dataset to {labelled_qca_dataset_path}") - - async def agenerate_labelled_qca_dataset( - self, - ) -> LabelledRagDataset: - examples: List[LabelledRagQcaSample] = [] - docs = self._vector_index._docstore.docs - nodes = list(docs.values()) - query_tasks = [] - for node in nodes: - prompt_str = self.text_question_template.format( - context_str=node.text, num_questions_per_chunk=3 - ) - task = self._llm.acomplete(prompt=prompt_str) - query_tasks.append(task) - responses = await run_jobs(query_tasks, self._show_progress, self._workers) - for node, response in zip(nodes, responses): - result = str(response).strip().split("\n") - cleaned_questions = [ - re.sub(r"^\d+[\).\s]", "", question).strip() for question in result - ] - cleaned_questions = [ - question for question in cleaned_questions if len(question) > 0 - ] - qr_tasks = [] - for query in cleaned_questions: - # build summary index off of node (i.e. 
context) - prompt_str = self.text_question_answer_template.format( - context_str=node.text, query_str=query - ) - qr_task = self._llm.acomplete(prompt=prompt_str) - qr_tasks.append(qr_task) - answer_responses: List[RESPONSE_TYPE] = await run_jobs( - qr_tasks, self._show_progress, self._workers - ) - for ( - question, - answer_response, - ) in zip(cleaned_questions, answer_responses): - example = LabelledRagQcaSample( - query=question, - reference_answer=str(answer_response), - reference_contexts=[node.text], - reference_node_id=[node.node_id], - reference_answer_by=self.created_by, - query_by=self.created_by, - ) - examples.append(example) - labelled_qca_dataset = LabelledRagDataset(examples=examples) - self.save_labelled_qca_dataset_json(labelled_qca_dataset) - return labelled_qca_dataset diff --git a/src/pai_rag/evaluation/generator/predicted_qca_generator.py b/src/pai_rag/evaluation/generator/predicted_qca_generator.py deleted file mode 100644 index 2d1e1838..00000000 --- a/src/pai_rag/evaluation/generator/predicted_qca_generator.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import List -from llama_index.core.indices import VectorStoreIndex -from llama_index.core.async_utils import run_jobs -from pai_rag.evaluation.generator.rag_qca_sample import PredictedRagQcaSample -from llama_index.core.llama_dataset import ( - CreatedBy, - CreatedByType, - LabelledRagDataset, -) - -import json -import os -import logging - -logger = logging.getLogger(__name__) - - -class PredictedRagQcaGenerator: - def __init__( - self, llm, vector_index: VectorStoreIndex = None, persist_path: str = None - ): - self._vector_index = vector_index._vector_index - self._llm = llm - model_name = self._llm.metadata.model_name - self.created_by = CreatedBy(type=CreatedByType.AI, model_name=model_name) - self._query_engine = vector_index._vector_index.as_query_engine(llm=self._llm) - self.persist_path = persist_path - self._show_progress = True - self._workers = 2 - - def from_labelled_qca_dataset(self) -> LabelledRagDataset: - """Load json.""" - labelled_qca_dataset_path = os.path.join( - self.persist_path, "labelled_qca_dataset.json" - ) - with open(labelled_qca_dataset_path, encoding="utf-8") as f: - data = json.load(f) - return data - - def save_predicted_qca_dataset_json(self, qas: LabelledRagDataset) -> None: - """Save json.""" - predicted_qca_dataset_path = os.path.join( - self.persist_path, "predicted_qca_dataset.json" - ) - with open(predicted_qca_dataset_path, "w", encoding="utf-8") as f: - json.dump(qas.dict(), f, indent=4, ensure_ascii=False) - logger.info(f"Saved labelled qca dataset to {predicted_qca_dataset_path}") - - async def agenerate_predicted_qca_dataset( - self, - ): - labelled_qca_dataset = self.from_labelled_qca_dataset() - query_tasks = [] - labelled_qca_set = [] - for qca in labelled_qca_dataset["examples"]: - task = self._query_engine.aquery(qca["query"]) - labelled_qca_set.append(qca) - query_tasks.append(task) - responses = await run_jobs(query_tasks, self._show_progress, self._workers) - examples: List[PredictedRagQcaSample] = [] - for qca, response in zip(labelled_qca_set, responses): - example = PredictedRagQcaSample( - query=qca["query"], - reference_answer=qca["reference_answer"], - reference_contexts=qca["reference_contexts"], - reference_node_id=qca["reference_node_id"], - reference_answer_by=qca["reference_answer_by"], - query_by=qca["query_by"], - predicted_contexts=[node.node.text for node in response.source_nodes], - predicted_node_id=[node.node.node_id for node in 
response.source_nodes], - predicted_answer=response.response, - predicted_answer_by=self.created_by, - ) - examples.append(example) - predicted_qca_dataset = LabelledRagDataset(examples=examples) - self.save_predicted_qca_dataset_json(predicted_qca_dataset) - return predicted_qca_dataset diff --git a/src/pai_rag/evaluation/generator/rag_qca_generator.py b/src/pai_rag/evaluation/generator/rag_qca_generator.py new file mode 100644 index 00000000..6002462c --- /dev/null +++ b/src/pai_rag/evaluation/generator/rag_qca_generator.py @@ -0,0 +1,155 @@ +from typing import List +from llama_index.core.indices import VectorStoreIndex +from pai_rag.utils.prompt_template import ( + DEFAULT_QUESTION_GENERATION_PROMPT, + DEFAULT_TEXT_QA_PROMPT_TMPL, + DEFAULT_QUESTION_GENERATION_QUERY, +) +from llama_index.core.base.response.schema import RESPONSE_TYPE +from llama_index.core.prompts.base import PromptTemplate +import re +from llama_index.core.async_utils import run_jobs +from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample, PaiRagQcaDataset +from llama_index.core.llama_dataset import ( + CreatedBy, + CreatedByType, +) +from pai_rag.integrations.synthesizer.pai_synthesizer import PaiQueryBundle + +import os +import logging +from pai_rag.integrations.query_engine.pai_retriever_query_engine import ( + PaiRetrieverQueryEngine, +) + +logger = logging.getLogger(__name__) + + +class RagQcaGenerator: + def __init__( + self, + llm, + vector_index: VectorStoreIndex = None, + query_engine: PaiRetrieverQueryEngine = None, + persist_path: str = None, + ): + self._llm = llm + self._vector_index = vector_index._vector_index + self._query_engine = query_engine + self.question_gen_query = DEFAULT_QUESTION_GENERATION_QUERY.format( + num_questions_per_chunk=3 + ) + self.text_question_template = PromptTemplate(DEFAULT_QUESTION_GENERATION_PROMPT) + self.text_question_answer_template = PromptTemplate(DEFAULT_TEXT_QA_PROMPT_TMPL) + self.created_by = CreatedBy( + type=CreatedByType.AI, model_name=self._llm.metadata.model_name + ) + self.persist_path = persist_path + self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json") + self._show_progress = True + self._workers = 2 + + def load_qca_dataset(self) -> None: + if os.path.exists(self.qca_dataset_path): + rag_qca_dataset = PaiRagQcaDataset.from_json(self.qca_dataset_path) + print( + f"A RAG QCA dataset already exists at {self.qca_dataset_path} with status: [labelled: {rag_qca_dataset.labelled}, predicted: {rag_qca_dataset.predicted}]." + ) + return rag_qca_dataset + else: + print("No existing QCA dataset found. You can proceed to create a new one.") + return None + + async def agenerate_qca_dataset(self, stage): + rag_qca_dataset = self.load_qca_dataset() + if rag_qca_dataset and rag_qca_dataset.labelled: + if stage == "labelled": + print("Labelled QCA dataset already exists. Skipping labelled stage.") + return rag_qca_dataset.examples + elif stage == "predicted": + if rag_qca_dataset.predicted: + print( + "Predicted QCA dataset already exists. Skipping predicted stage." 
+ ) + return rag_qca_dataset.examples + else: + return await self.agenerate_predicted_qca_dataset(rag_qca_dataset) + else: + raise ValueError(f"Invalid stage: {stage}") + else: + return await self.agenerate_labelled_qca_dataset() + + async def agenerate_labelled_qca_sample(self, node): + prompt_str = self.text_question_template.format( + context_str=node.text, num_questions_per_chunk=1 + ) + response = await self._llm.acomplete(prompt=prompt_str, image_documents=None) + result = str(response).strip().split("\n") + cleaned_questions = [ + re.sub(r"^\d+[\).\s]", "", question).strip() for question in result + ] + cleaned_questions = [ + question for question in cleaned_questions if len(question) > 0 + ] + qr_tasks = [] + for query in cleaned_questions: + prompt_str = self.text_question_answer_template.format( + context_str=node.text, query_str=query + ) + qr_task = self._llm.acomplete(prompt=prompt_str, image_documents=None) + qr_tasks.append(qr_task) + answer_responses: List[RESPONSE_TYPE] = await run_jobs( + qr_tasks, self._show_progress, self._workers + ) + for ( + question, + answer_response, + ) in zip(cleaned_questions, answer_responses): + sample = RagQcaSample( + query=question, + reference_answer=str(answer_response), + reference_contexts=[node.text], + reference_node_id=[node.node_id], + reference_answer_by=self.created_by, + query_by=self.created_by, + ) + return sample + + async def agenerate_labelled_qca_dataset( + self, + ): + print("Starting to generate QCA dataset for [[labelled]].") + docs = self._vector_index._docstore.docs + nodes = list(docs.values()) + tasks = [] + for node in nodes: + tasks.append(self.agenerate_labelled_qca_sample(node)) + examples = await run_jobs(tasks, self._show_progress, self._workers) + labelled_qca_dataset = PaiRagQcaDataset(examples=examples, labelled=True) + labelled_qca_dataset.save_json(self.qca_dataset_path) + return labelled_qca_dataset.examples + + async def agenerate_predicted_qca_sample(self, qca_sample): + query_bundle = PaiQueryBundle(query_str=qca_sample.query) + response = await self._query_engine.aquery(query_bundle) + qca_sample.predicted_answer = response.response + qca_sample.predicted_contexts = [ + node.node.text for node in response.source_nodes + ] + qca_sample.predicted_node_id = [ + node.node.node_id for node in response.source_nodes + ] + qca_sample.predicted_answer_by = self.created_by + return qca_sample + + async def agenerate_predicted_qca_dataset(self, rag_qca_dataset): + print("Starting to generate QCA dataset for [[predicted]].") + tasks = [] + for qca_sample in rag_qca_dataset.examples: + tasks.append(self.agenerate_predicted_qca_sample(qca_sample)) + predicted_examples = await run_jobs(tasks, self._show_progress, self._workers) + predicted_qca_dataset = PaiRagQcaDataset( + examples=predicted_examples, labelled=True, predicted=True + ) + predicted_qca_dataset.save_json(self.qca_dataset_path) + return predicted_qca_dataset diff --git a/src/pai_rag/evaluation/metrics/response/base.py b/src/pai_rag/evaluation/metrics/response/base.py new file mode 100644 index 00000000..d0bad74f --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/base.py @@ -0,0 +1,73 @@ +"""Llm metric for response evaluation.""" +from abc import abstractmethod +from typing import Any, Optional, Sequence, Union + +from llama_index.core.evaluation.base import EvaluationResult +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import BasePromptTemplate, PromptTemplate +from llama_index.core.prompts.mixin import 
PromptDictType +from llama_index.core.prompts.mixin import PromptMixin, PromptMixinType + +DEFAULT_EVAL_TEMPLATE = PromptTemplate( + "Information: {query_str}\n" "Context: {context_str}\n" "Answer: " "Reason: " +) + + +class LlmMetric(PromptMixin): + """ + Llm Metric. + """ + + metric_name: str = "base" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + ) -> None: + """Init params.""" + self._llm = llm + self._raise_error = raise_error + + self._eval_template: BasePromptTemplate + if isinstance(eval_template, str): + self._eval_template = PromptTemplate(eval_template) + else: + self._eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + def _get_prompts(self) -> PromptDictType: + """Get prompts.""" + return { + "eval_template": self._eval_template, + } + + def _update_prompts(self, prompts: PromptDictType) -> None: + """Update prompts.""" + if "eval_template" in prompts: + self._eval_template = prompts["eval_template"] + + @abstractmethod + async def parse_eval_result(self, eval_result: str) -> float: + """Parse eval_result.""" + raise NotImplementedError + + @abstractmethod + async def aevaluate( + self, + query: str | None = None, + response: str | None = None, + contexts: Sequence[str] | None = None, + **kwargs: Any, + ) -> EvaluationResult: + """Run evaluation with query string, retrieved contexts, + and generated response string. + + Subclasses can override this method to provide custom evaluation logic and + take in additional arguments. + """ + raise NotImplementedError + + def _get_prompt_modules(self) -> PromptMixinType: + """Get prompt modules.""" + return {} diff --git a/src/pai_rag/evaluation/metrics/response/correctness.py b/src/pai_rag/evaluation/metrics/response/correctness.py new file mode 100644 index 00000000..3e8b820a --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/correctness.py @@ -0,0 +1,143 @@ +"""Correctness evaluation.""" +import asyncio +from typing import Any, Optional, Sequence, Union + +from llama_index.core.evaluation.base import EvaluationResult +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import ( + BasePromptTemplate, + ChatMessage, + ChatPromptTemplate, + MessageRole, + PromptTemplate, +) +from pai_rag.evaluation.metrics.response.base import LlmMetric + +DEFAULT_SYSTEM_TEMPLATE = """ +You are an expert evaluation system for a question answering chatbot. + +You are given the following information: +- a user query, and +- a generated answer + +You may also be given a reference answer to use for reference in your evaluation. + +Your job is to judge the relevance and correctness of the generated answer. +Output a single score that represents a holistic evaluation. +You must return your response in a line with only the score. +Do not return answers in any other format. +On a separate line provide your reasoning for the score as well. + +Follow these guidelines for scoring: +- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best. +- If the generated answer is not relevant to the user query, \ +you should give a score of 1. +- If the generated answer is relevant but contains mistakes, \ +you should give a score between 2 and 3. +- If the generated answer is relevant and fully correct, \ +you should give a score between 4 and 5. + +Example Response: +4.0 +The generated answer has the exact same metrics as the reference answer, \ + but it is not as concise. 
+ +""" + +DEFAULT_USER_TEMPLATE = """ +## User Query +{query} + +## Reference Answer +{reference_answer} + +## Generated Answer +{generated_answer} +""" + +DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate( + message_templates=[ + ChatMessage(role=MessageRole.SYSTEM, content=DEFAULT_SYSTEM_TEMPLATE), + ChatMessage(role=MessageRole.USER, content=DEFAULT_USER_TEMPLATE), + ] +) + + +class Correctness(LlmMetric): + """Correctness evaluator. + + Evaluates the correctness of a question answering system. + This evaluator depends on `reference` answer to be provided, in addition to the + query string and response string. + + It outputs a score between 1 and 5, where 1 is the worst and 5 is the best, + along with a reasoning for the score. + Passing is defined as a score greater than or equal to the given threshold. + + Args: + service_context (Optional[ServiceContext]): Service context. + eval_template (Optional[Union[BasePromptTemplate, str]]): + Template for the evaluation prompt. + score_threshold (float): Numerical threshold for passing the evaluation, + defaults to 4.0. + """ + + metric_name: str = "correctness" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + score_threshold: float = 4.0, + ) -> None: + if isinstance(eval_template, str): + eval_template = PromptTemplate(eval_template) + else: + eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + super().__init__(llm, raise_error, eval_template) + + self._score_threshold = score_threshold + + def parse_eval_result(self, eval_result: str): + if not eval_result.strip(): + # Return None or default values if the response is empty + return None, "No response" + + score_str, reasoning_str = eval_result.split("\n", 1) + + try: + score = float(score_str) + except ValueError: + score = None + + reasoning = reasoning_str.lstrip("\n") + return [score, reasoning] + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + reference: Optional[str] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("query, and response must be provided") + + raw_response = await self._llm.apredict( + prompt=self._eval_template, + query=query, + generated_answer=response, + reference_answer=reference or "(NO REFERENCE ANSWER SUPPLIED)", + ) + + # Use the parser function + return self.parse_eval_result(raw_response) diff --git a/src/pai_rag/evaluation/metrics/response/faithfulness.py b/src/pai_rag/evaluation/metrics/response/faithfulness.py new file mode 100644 index 00000000..7e545963 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/response/faithfulness.py @@ -0,0 +1,116 @@ +"""Faithfulness evaluation.""" +import asyncio +from typing import Any, Optional, Sequence, Union +from llama_index.core.llms.llm import LLM +from llama_index.core.prompts import ( + BasePromptTemplate, + PromptTemplate, +) +from llama_index.core.evaluation.base import EvaluationResult +from pai_rag.evaluation.metrics.response.base import LlmMetric + +DEFAULT_EVAL_TEMPLATE = PromptTemplate( + "Please tell if a given piece of information " + "is supported by the context.\n" + "You need to answer with either YES or NO.\n" + "Answer YES if any of the context supports the information, even " + "if most of the context is unrelated. 
" + "Some examples are provided below. \n\n" + "Information: Apple pie is generally double-crusted.\n" + "Context: An apple pie is a fruit pie in which the principal filling " + "ingredient is apples. \n" + "Apple pie is often served with whipped cream, ice cream " + "('apple pie à la mode'), custard or cheddar cheese.\n" + "It is generally double-crusted, with pastry both above " + "and below the filling; the upper crust may be solid or " + "latticed (woven of crosswise strips).\n" + "Answer: YES\n" + "Reason: The context explicitly states that 'It is generally double-crusted,' " + "which directly supports the information that 'Apple pie is generally double-crusted.' " + "Therefore, the information is confirmed by the context. \n\n" + "Information: Apple pies tastes bad.\n" + "Context: An apple pie is a fruit pie in which the principal filling " + "ingredient is apples. \n" + "Apple pie is often served with whipped cream, ice cream " + "('apple pie à la mode'), custard or cheddar cheese.\n" + "It is generally double-crusted, with pastry both above " + "and below the filling; the upper crust may be solid or " + "latticed (woven of crosswise strips).\n" + "Answer: NO\n" + "Reason: The context does not provide any information regarding the taste of apple pie. " + "It describes the ingredients and serving suggestions but does not support the claim that " + "'apple pies taste bad.' Therefore, the information is not supported by the context. \n" + "Information: {query_str}\n" + "Context: {context_str}\n" + "Answer: " + "Reason: " +) + + +class Faithfulness(LlmMetric): + """ + Faithfulness evaluator. + + Evaluates whether a response is faithful to the contexts + (i.e. whether the response is supported by the contexts or hallucinated.) + + This evaluator only considers the response string and the list of context strings. + + Args: + raise_error(bool): Whether to raise an error when the response is invalid. + Defaults to False. + eval_template(Optional[Union[str, BasePromptTemplate]]): + The template to use for evaluation. 
+ """ + + metric_name: str = "faithfulness" + + def __init__( + self, + llm: Optional[LLM] = None, + raise_error: bool = False, + eval_template: Optional[Union[str, BasePromptTemplate]] = None, + ) -> None: + if isinstance(eval_template, str): + eval_template = PromptTemplate(eval_template) + else: + eval_template = eval_template or DEFAULT_EVAL_TEMPLATE + + super().__init__(llm, raise_error, eval_template) + + def parse_eval_result(self, eval_result: str): + raw_response_txt = eval_result.text.lower() + if "yes" in raw_response_txt: + passing = True + else: + passing = False + if self._raise_error: + raise ValueError("The response is invalid") + score = 1.0 if passing else 0.0 + reasoning = raw_response_txt + return [score, reasoning] + + async def aevaluate( + self, + query: str | None = None, + response: str | None = None, + contexts: Sequence[str] | None = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + """Evaluate whether the response is faithful to the contexts.""" + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if contexts is None or response is None: + raise ValueError("contexts and response must be provided") + + prompt_str = self._eval_template.format( + query_str=query, + context_str="\n".join(contexts), + ) + raw_response = await self._llm.acomplete(prompt=prompt_str) + + # Use the parser function + return self.parse_eval_result(raw_response) diff --git a/src/pai_rag/evaluation/metrics/retrieval/core.py b/src/pai_rag/evaluation/metrics/retrieval/core.py new file mode 100644 index 00000000..c25024e8 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/core.py @@ -0,0 +1,36 @@ +def default_hit_rate(expected_ids, retrieved_ids): + """Default HitRate calculation: Check if there is a single hit""" + is_hit = any(id in expected_ids for id in retrieved_ids) + score = 1.0 if is_hit else 0.0 + return score + + +def granular_hit_rate(expected_ids, retrieved_ids): + """Granular HitRate calculation: Calculate all hits and divide by the number of expected docs""" + expected_set = set(expected_ids) + hits = sum(1 for doc_id in retrieved_ids if doc_id in expected_set) + score = hits / len(expected_ids) if expected_ids else 0.0 + return score + + +def default_mrr(expected_ids, retrieved_ids): + """Default MRR calculation: Reciprocal rank of the first relevant document retrieved""" + for i, id in enumerate(retrieved_ids): + if id in expected_ids: + return 1.0 / (i + 1) + return 0.0 + + +def granular_mrr(expected_ids, retrieved_ids): + """Granular MRR calculation: All relevant retrieved docs have their reciprocal ranks summed and averaged.""" + expected_set = set(expected_ids) + reciprocal_rank_sum = 0.0 + relevant_docs_count = 0 + for index, doc_id in enumerate(retrieved_ids): + if doc_id in expected_set: + relevant_docs_count += 1 + reciprocal_rank_sum += 1.0 / (index + 1) + score = ( + reciprocal_rank_sum / relevant_docs_count if relevant_docs_count > 0 else 0.0 + ) + return score diff --git a/src/pai_rag/evaluation/metrics/retrieval/hitrate.py b/src/pai_rag/evaluation/metrics/retrieval/hitrate.py new file mode 100644 index 00000000..a51c1e65 --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/hitrate.py @@ -0,0 +1,51 @@ +from typing import List, Optional +from pai_rag.evaluation.metrics.retrieval.core import ( + granular_hit_rate, + default_hit_rate, +) + + +class HitRate: + """Hit rate metric: Compute hit rate with two calculation options. 
+ + - The default method checks for a single match between any of the retrieved docs and expected docs. + - The more granular method checks for all potential matches between retrieved docs and expected docs. + + Attributes: + metric_name (str): The name of the metric. + use_granular_hit_rate (bool): Determines whether to use the granular method for calculation. + """ + + metric_name: str = "hitrate" + use_granular_hit_rate: bool = False + + def compute( + self, + expected_ids: Optional[List[str]] = None, + retrieved_ids: Optional[List[str]] = None, + ): + """Compute metric based on the provided inputs. + + Parameters: + expected_ids (Optional[List[str]]): Expected document IDs. + retrieved_ids (Optional[List[str]]): Retrieved document IDs. + + Raises: + ValueError: If the necessary IDs are not provided. + + Returns: + RetrievalMetricResult: The result with the computed hit rate score. + """ + # Checking for the required arguments + if ( + retrieved_ids is None + or expected_ids is None + or not retrieved_ids + or not expected_ids + ): + raise ValueError("Retrieved ids and expected ids must be provided") + + if self.use_granular_hit_rate: + return granular_hit_rate(expected_ids, retrieved_ids) + else: + return default_hit_rate(expected_ids, retrieved_ids) diff --git a/src/pai_rag/evaluation/metrics/retrieval/mrr.py b/src/pai_rag/evaluation/metrics/retrieval/mrr.py new file mode 100644 index 00000000..ed92449b --- /dev/null +++ b/src/pai_rag/evaluation/metrics/retrieval/mrr.py @@ -0,0 +1,51 @@ +from typing import List, Optional +from pai_rag.evaluation.metrics.retrieval.core import ( + default_mrr, + granular_mrr, +) + + +class MRR: + """MRR (Mean Reciprocal Rank) metric with two calculation options. + + - The default method calculates the reciprocal rank of the first relevant retrieved document. + - The more granular method sums the reciprocal ranks of all relevant retrieved documents and divides by the count of relevant documents. + + Attributes: + metric_name (str): The name of the metric. + use_granular_mrr (bool): Determines whether to use the granular method for calculation. + """ + + metric_name: str = "mrr" + use_granular_mrr: bool = False + + def compute( + self, + expected_ids: Optional[List[str]] = None, + retrieved_ids: Optional[List[str]] = None, + ): + """Compute MRR based on the provided inputs and selected method. + + Parameters: + expected_ids (Optional[List[str]]): Expected document IDs. + retrieved_ids (Optional[List[str]]): Retrieved document IDs. + + Raises: + ValueError: If the necessary IDs are not provided. + + Returns: + RetrievalMetricResult: The result with the computed MRR score. 
+ """ + # Checking for the required arguments + if ( + retrieved_ids is None + or expected_ids is None + or not retrieved_ids + or not expected_ids + ): + raise ValueError("Retrieved ids and expected ids must be provided") + + if self.use_granular_mrr: + return granular_mrr(expected_ids, retrieved_ids) + else: + return default_mrr(expected_ids, retrieved_ids) diff --git a/src/pai_rag/evaluation/run_evaluation_experiments.py b/src/pai_rag/evaluation/run_evaluation_experiments.py new file mode 100644 index 00000000..7cb0826b --- /dev/null +++ b/src/pai_rag/evaluation/run_evaluation_experiments.py @@ -0,0 +1,64 @@ +import yaml +import click +import logging +import time +import json +import hashlib +from pai_rag.evaluation.run_evaluation_pipeline import run_evaluation_pipeline + + +def validate_json_file(ctx, param, value): + """检查文件路径是否以 .json 结尾""" + if value is not None and not value.endswith(".json"): + raise click.BadParameter( + "Output path must be a JSON file with a .json extension." + ) + return value + + +def calculate_md5_from_json(data): + """计算 JSON 内容的 MD5 值""" + hasher = hashlib.md5() + # 将 JSON 对象转换为字符串,并确保顺序一致 + json_str = json.dumps(data, sort_keys=True) + hasher.update(json_str.encode("utf-8")) + return hasher.hexdigest() + + +def run_experiment(exp_params): + name = exp_params["name"] + logging.info(f"Running experiment with name={name}, exp_params={exp_params}") + try: + # 运行实验并获取结果 + result = run_evaluation_pipeline( + config=exp_params["setting_file"], + data_path=exp_params["data_path"], + name=name, + ) + logging.info(f"Finished experiment with name={name}") + except Exception as e: + logging.error(f"Error running experiment {name}: {e}") + + return {"name": exp_params["name"], "parameters": exp_params, "result": result} + + +@click.command() +@click.option("-i", "--input_exp_config", show_default=True) +@click.option("-o", "--output_path", callback=validate_json_file, show_default=True) +def run(input_exp_config=None, output_path=None): + with open(input_exp_config) as file: + configs = yaml.safe_load(file) + + if not output_path: + timestamp = time.strftime("%Y%m%d-%H%M%S") + file_key = calculate_md5_from_json(configs) + output_path = f"localdata/eval_exp_data/results_{file_key}_{timestamp}.json" + results = [] + for exp in configs["experiment"]: + result = run_experiment(exp) + results.append(result) + + with open(output_path, "w") as result_file: + json.dump(results, result_file, ensure_ascii=False, indent=4) + + logging.info(f"Results saved to {output_path}") diff --git a/src/pai_rag/evaluation/run_evaluation_pipeline.py b/src/pai_rag/evaluation/run_evaluation_pipeline.py new file mode 100644 index 00000000..67841a6b --- /dev/null +++ b/src/pai_rag/evaluation/run_evaluation_pipeline.py @@ -0,0 +1,99 @@ +import os +import asyncio +from pathlib import Path +from pai_rag.core.rag_config_manager import RagConfigManager +from pai_rag.core.rag_data_loader import RagDataLoader +from pai_rag.core.rag_module import ( + resolve, + resolve_data_loader, + resolve_llm, + resolve_vector_index, + resolve_query_engine, +) +from pai_rag.evaluation.generator.rag_qca_generator import RagQcaGenerator +from pai_rag.integrations.llms.pai.pai_multi_modal_llm import ( + PaiMultiModalLlm, +) +from pai_rag.evaluation.evaluator.base_evaluator import BaseEvaluator +import logging + + +logger = logging.getLogger(__name__) + +_BASE_DIR = Path(__file__).parent.parent +DEFAULT_APPLICATION_CONFIG_FILE = os.path.join( + _BASE_DIR, "evaluation/settings_eval.toml" +) + + +def _create_data_loader( + 
config_file, name, enable_raptor: bool = False +) -> RagDataLoader: + config = RagConfigManager.from_file(config_file).get_value() + + config.index.vector_store.persist_path = ( + f"{config.index.vector_store.persist_path}__{name}" + ) + data_loader = resolve_data_loader(config) + vector_index = resolve_vector_index(config) + query_engine = resolve_query_engine(config) + + return data_loader, vector_index, query_engine + + +def _create_qca_generator(config_file, name, vector_index, query_engine): + config = RagConfigManager.from_file(config_file).get_value() + multimodal_llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm) + persist_path = f"{config.index.vector_store.persist_path}__{name}" + qca_generator = RagQcaGenerator( + llm=multimodal_llm, + vector_index=vector_index, + query_engine=query_engine, + persist_path=persist_path, + ) + return qca_generator + + +def _create_base_evaluator(config_file, name): + config = RagConfigManager.from_file(config_file).get_value() + llm = resolve_llm(config) + persist_path = f"{config.index.vector_store.persist_path}__{name}" + return BaseEvaluator( + llm=llm, + persist_path=persist_path, + ) + + +def run_evaluation_pipeline( + config=None, + oss_path=None, + data_path=None, + pattern=None, + enable_raptor=False, + name="default", +): + assert (oss_path is not None) or ( + data_path is not None + ), "Must provide either local path or oss path." + assert (oss_path is None) or ( + data_path is None + ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." + + data_loader, vector_index, query_engine = _create_data_loader( + config, name, enable_raptor + ) + data_loader.load_data( + file_path_or_directory=data_path, + filter_pattern=pattern, + oss_path=oss_path, + from_oss=oss_path is not None, + enable_raptor=enable_raptor, + ) + qca_generator = _create_qca_generator(config, name, vector_index, query_engine) + _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="labelled")) + _ = asyncio.run(qca_generator.agenerate_qca_dataset(stage="predicted")) + evaluator = _create_base_evaluator(config, name) + retrieval_result = asyncio.run(evaluator.aevaluation(stage="retrieval")) + response_result = asyncio.run(evaluator.aevaluation(stage="response")) + print("retrieval_result", retrieval_result, "response_result", response_result) + return {"retrieval": retrieval_result, "response": response_result} diff --git a/src/pai_rag/tools/agent_tool.py b/src/pai_rag/tools/agent_tool.py index 396ed721..122903c3 100644 --- a/src/pai_rag/tools/agent_tool.py +++ b/src/pai_rag/tools/agent_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_agent import logging @@ -61,12 +60,11 @@ def run( config = RagConfigManager.from_file(config_file).get_value() if tool_definition_file: - config.rag.agent.tool_definition_file = tool_definition_file + config.agent.tool_definition_file = tool_definition_file if python_script_file: - config.rag.agent.python_script_file = python_script_file + config.agent.python_script_file = python_script_file - rag_config = RagConfig.model_validate(config.rag) - agent = resolve_agent(rag_config) + agent = resolve_agent(config) print("**Question**: ", question) response = agent.chat(question) diff --git a/src/pai_rag/tools/load_data_tool.py b/src/pai_rag/tools/load_data_tool.py index 120581ba..d3dd94cd 100644 --- 
a/src/pai_rag/tools/load_data_tool.py +++ b/src/pai_rag/tools/load_data_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_data_loader import logging @@ -72,9 +71,8 @@ def run( ), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'." config = RagConfigManager.from_file(config_file).get_value() - rag_config = RagConfig.model_validate(config.rag) - data_loader = resolve_data_loader(rag_config) + data_loader = resolve_data_loader(config) data_loader.load_data( file_path_or_directory=data_path, filter_pattern=pattern, diff --git a/src/pai_rag/tools/query_tool.py b/src/pai_rag/tools/query_tool.py index 3ef6a693..8f303bc1 100644 --- a/src/pai_rag/tools/query_tool.py +++ b/src/pai_rag/tools/query_tool.py @@ -1,7 +1,6 @@ import click import os from pathlib import Path -from pai_rag.core.rag_config import RagConfig from pai_rag.core.rag_config_manager import RagConfigManager from pai_rag.core.rag_module import resolve_query_engine, setup_tracing from pai_rag.integrations.synthesizer.pai_synthesizer import PaiQueryBundle @@ -45,10 +44,9 @@ def run( logging.basicConfig(level=logging.INFO) config = RagConfigManager.from_file(config_file).get_value() - rag_config = RagConfig.model_validate(config.rag) - setup_tracing(rag_config.trace) + setup_tracing(config.trace) - query_engine = resolve_query_engine(rag_config) + query_engine = resolve_query_engine(config) print("**Question**: ", question)
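As a rough usage sketch: the new `run_eval_exp` entry point reads the experiment list from a YAML file and calls `run_evaluation_pipeline` once per entry; the snippet below drives a single experiment directly, assuming the example paths from `src/pai_rag/config/evaluation/config.yaml` exist in the local checkout.

from pai_rag.evaluation.run_evaluation_pipeline import run_evaluation_pipeline

# Run one experiment directly, mirroring what run_evaluation_experiments.run
# does for each entry under "experiment" in the YAML config.
# The config/data paths below are taken from the example config.yaml and are
# assumptions about the local layout.
results = run_evaluation_pipeline(
    config="src/pai_rag/config/evaluation/settings_eval.toml",
    data_path="example_data/eval_docs",
    name="exp1",
)
# results == {"retrieval": {"mean_hitrate": ..., "mean_mrr": ...},
#             "response": {"mean_faithfulness_score": ..., "mean_correctness_score": ...}}
print(results)

The same experiments can be batch-run through the new poetry script, e.g. `run_eval_exp -i src/pai_rag/config/evaluation/config.yaml -o localdata/eval_exp_data/results.json` (the output path must end in .json).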