Support multi evaluators and experiments pipeline (#247)

* Add evaluator and metrics * Add evaluator and metrics * Add eval experiment pipeline * Modify entry file * Modify entry file * Modify result file * Fix * Refactor evaluation
aigc-apps · Oct 25, 2024 · 12a2862 · 12a2862
1 parent 68842d4
commit 12a2862
Show file tree

Hide file tree

Showing 23 changed files with 1,110 additions and 376 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -108,7 +108,7 @@ docx = "^0.2.4"
 pai_rag = "pai_rag.main:main"
 load_data = "pai_rag.tool.load_data_tool:run"
 load_model = "pai_rag.utils.download_models:load_models"
-evaluation = "pai_rag.evaluation.eval_pipeline:run"
+run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"
 
 [[tool.poetry.source]]
 name = "pytorch_cpu"

diff --git a/pyproject_gpu.toml b/pyproject_gpu.toml
@@ -100,7 +100,7 @@ peft = "^0.12.0"
 pai_rag = "pai_rag.main:main"
 load_data = "pai_rag.tool.load_data_tool:run"
 load_model = "pai_rag.utils.download_models:load_models"
-evaluation = "pai_rag.evaluation.eval_pipeline:run"
+run_eval_exp = "pai_rag.evaluation.run_evaluation_experiments:run"
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
diff --git a/src/pai_rag/app/web/ui_constants.py b/src/pai_rag/app/web/ui_constants.py
@@ -112,7 +112,7 @@
 MLLM_MODEL_KEY_DICT = {
     "dashscope": [
         "qwen-vl-max",
-        "qwen-vl-turbo",
+        "qwen-vl-plus",
     ]
 }
 

diff --git a/src/pai_rag/config/evaluation/config.yaml b/src/pai_rag/config/evaluation/config.yaml
@@ -0,0 +1,8 @@
+experiment:
+  # [custom knowledge dataset]
+  - name: "exp1"
+    data_path: "example_data/eval_docs"
+    setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
+  - name: "exp2"
+    data_path: "example_data/eval_docs_1"
+    setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
diff --git a/src/pai_rag/evaluation/settings_eval.toml → ..._rag/config/evaluation/settings_eval.toml b/src/pai_rag/evaluation/settings_eval.toml → ..._rag/config/evaluation/settings_eval.toml
@@ -5,16 +5,8 @@ name = "pai_rag"
 version = "0.1.1"
 
 [rag.agent]
-type = "react"
-
-[rag.agent.custom_config]
-agent_file_path = ""
-
-[rag.agent.intent_detection]
-type = ""
-
-[rag.agent.tool]
-type = ""
+custom_agent_config_file = ""
+agent_tool_type = ""
 
 [rag.chat_store]
 type = "Local" # [Local, Aliyun-Redis]
@@ -23,12 +15,9 @@ password = "Aliyun-Redis user:pwd"
 persist_path = "localdata/eval_exp_data/storage"
 
 [rag.data_analysis]
-analysis_type = "nl2pandas"
+type = "pandas"
 nl2sql_prompt = "给定一个输入问题，创建一个语法正确的{dialect}查询语句来执行，不要从特定的表中查询所有列，只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时，请使用表名限定列名。\n=====\n 你必须使用以下格式，每项占一行：\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "
 
-[rag.data_loader]
-type = "local"
-
 [rag.data_reader]
 type = "SimpleDirectoryReader"
 
@@ -42,9 +31,6 @@ type = "SimpleDirectoryReader"
 source = "DashScope"
 embed_batch_size = 10
 
-[rag.embedding.multi_modal]
-source = "cnclip"
-
 [rag.index]
 persist_path = "localdata/eval_exp_data/storage"
 enable_multimodal = true
@@ -60,12 +46,11 @@ vector_store.type = "FAISS"
 source = "DashScope"
 model = "qwen-turbo"
 
-[rag.llm.function_calling_llm]
-source = ""
+[rag.multimodal_embedding]
+source = "cnclip"
 
-[rag.llm.multi_modal]
-enable = true
-source = "DashScope"
+[rag.multimodal_llm]
+source = "dashscope"
 model = "qwen-vl-plus"
 
 [rag.node_enhancement]
@@ -81,20 +66,16 @@ enable_multimodal = true
 
 [rag.oss_store]
 bucket = ""
-endpoint = ""
-prefix = ""
+endpoint = "oss-cn-hangzhou.aliyuncs.com"
 
 [rag.postprocessor]
-reranker_type = "simple-weighted-reranker" # [simple-weighted-reranker, model-based-reranker]
+reranker_type = "no-reranker" # [no-reranker, simple-weighted-reranker, model-based-reranker]
 reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
 keyword_weight = 0.3
 vector_weight = 0.7
 similarity_threshold = 0.5
 top_n = 2
 
-[rag.query_engine]
-type = "RetrieverQueryEngine"
-
 [rag.query_transform]
 type = ""
 
@@ -111,6 +92,6 @@ type = "SimpleSummarize"
 text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n"
 
 [rag.trace]
-type = "pai-llm-trace"
+type = "pai_trace"
 endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090"
 token = ""
diff --git a/src/pai_rag/evaluation/dataset/rag_eval_dataset.py b/src/pai_rag/evaluation/dataset/rag_eval_dataset.py
@@ -0,0 +1,111 @@
+from typing import List, Optional, Type, Dict
+from llama_index.core.bridge.pydantic import Field
+import json
+from llama_index.core.bridge.pydantic import BaseModel
+from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample
+
+
+class EvaluationSample(RagQcaSample):
+    """RAG sample extended with per-sample evaluation metrics.
+
+    Adds two retrieval metrics (``hitrate``, ``mrr``) and two response
+    metrics (``faithfulness``, ``correctness``; each with a score and a
+    reason) on top of the fields inherited from ``RagQcaSample``. Every
+    metric field defaults to ``None``, so a sample can be constructed
+    without any metric values.
+    """
+
+    # NOTE: these fields use ``default=None`` rather than
+    # ``default_factory=None``. ``default_factory`` expects a zero-argument
+    # callable; passing ``None`` supplies neither a factory nor an explicit
+    # default, which can make an ``Optional`` field required.
+    hitrate: Optional[float] = Field(
+        default=None,
+        description="The hitrate value for retrieval evaluation.",
+    )
+    mrr: Optional[float] = Field(
+        default=None,
+        description="The mrr value for retrieval evaluation.",
+    )
+
+    faithfulness_score: Optional[float] = Field(
+        default=None,
+        description="The faithfulness score for response evaluation.",
+    )
+
+    faithfulness_reason: Optional[str] = Field(
+        default=None,
+        description="The faithfulness reason for response evaluation.",
+    )
+
+    correctness_score: Optional[float] = Field(
+        default=None,
+        description="The correctness score for response evaluation.",
+    )
+
+    correctness_reason: Optional[str] = Field(
+        default=None,
+        description="The correctness reason for response evaluation.",
+    )
+
+    @property
+    def class_name(self) -> str:
+        """Data example class name."""
+        return "EvaluationSample"
+
+
+class PaiRagEvalDataset(BaseModel):
+    """Collection of evaluated samples plus their aggregated metric results.
+
+    ``examples`` holds the per-sample metrics. ``results`` holds the mean
+    metrics grouped under the keys ``"retrieval"`` and ``"response"``.
+    ``status`` holds one boolean per stage; a stage's means are computed
+    only when its flag is true.
+    """
+
+    _example_type: Type[EvaluationSample] = EvaluationSample  # type: ignore[misc]
+    examples: List[EvaluationSample] = Field(
+        default_factory=list, description="Data examples of this dataset."
+    )
+    results: Dict[str, Dict[str, float]] = Field(
+        default_factory=dict, description="Evaluation result of this dataset."
+    )
+    status: Dict[str, bool] = Field(
+        default_factory=dict, description="Status of this dataset."
+    )
+
+    @property
+    def class_name(self) -> str:
+        """Class name."""
+        return "PaiRagEvalDataset"
+
+    def cal_mean_metric_score(self) -> None:
+        """Aggregate the per-sample metrics into ``self.results``.
+
+        ``self.results["retrieval"]`` and ``self.results["response"]`` are
+        always reset to empty dicts first. A stage is then filled with its
+        mean metrics only when the matching ``self.status`` flag is true and
+        the dataset has at least one example.
+
+        Raises:
+            TypeError: If a stage is flagged in ``status`` but one of its
+                samples still has ``None`` for that stage's metric.
+        """
+
+        def mean_of(metric: str) -> float:
+            """Return the mean of attribute ``metric`` over all examples."""
+            total = sum(float(getattr(entry, metric)) for entry in self.examples)
+            return total / len(self.examples)
+
+        self.results["retrieval"] = {}
+        self.results["response"] = {}
+
+        if not self.examples:
+            # Nothing to average; returning here avoids a ZeroDivisionError.
+            return
+
+        # ``status`` defaults to an empty dict, so a missing stage key is
+        # treated as "not evaluated" instead of raising KeyError.
+        if self.status.get("retrieval"):
+            self.results["retrieval"] = {
+                "mean_hitrate": mean_of("hitrate"),
+                "mean_mrr": mean_of("mrr"),
+            }
+        if self.status.get("response"):
+            self.results["response"] = {
+                "mean_faithfulness_score": mean_of("faithfulness_score"),
+                "mean_correctness_score": mean_of("correctness_score"),
+            }
+
+    def save_json(self, path: str) -> None:
+        """Recompute the mean metrics and write the dataset to ``path``.
+
+        The file is written as UTF-8 JSON with the keys ``"examples"``,
+        ``"results"`` and ``"status"``.
+        """
+        self.cal_mean_metric_score()
+
+        with open(path, "w", encoding="utf-8") as f:
+            examples = [self._example_type.dict(el) for el in self.examples]
+            data = {
+                "examples": examples,
+                "results": self.results,
+                "status": self.status,
+            }
+
+            # ensure_ascii=False keeps non-ASCII text readable in the file.
+            json.dump(data, f, indent=4, ensure_ascii=False)
+            print(f"Saved dataset to {path}.")
+
+    @classmethod
+    def from_json(cls, path: str) -> "PaiRagEvalDataset":
+        """Load a dataset from a JSON file in the ``save_json`` layout.
+
+        Raises:
+            KeyError: If ``"examples"``, ``"results"`` or ``"status"`` is
+                missing from the file.
+        """
+        # Read with the same encoding save_json writes; the platform default
+        # may not be UTF-8 and the file can contain raw non-ASCII characters.
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
+        results = data["results"]
+        status = data["status"]
+
+        return cls(examples=examples, results=results, status=status)
diff --git a/...ag/evaluation/generator/rag_qca_sample.py → ...rag/evaluation/dataset/rag_qca_dataset.py b/...ag/evaluation/generator/rag_qca_sample.py → ...rag/evaluation/dataset/rag_qca_dataset.py
@@ -1,22 +1,15 @@
-from typing import List, Optional
+from typing import List, Optional, Type
 from llama_index.core.bridge.pydantic import Field
 from llama_index.core.llama_dataset.base import BaseLlamaDataExample
 from llama_index.core.llama_dataset import CreatedBy
+import json
+from llama_index.core.bridge.pydantic import BaseModel
 
 
-class LabelledRagQcaSample(BaseLlamaDataExample):
-    """RAG example class. Analogous to traditional ML datasets, this dataset contains
+class RagQcaSample(BaseLlamaDataExample):
+    """Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
     the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
     to evaluate the prediction.
-
-    Args:
-        query (str): The user query
-        query_by (CreatedBy): Query generated by human or ai (model-name)
-        reference_contexts (Optional[List[str]]): The contexts used for response
-        reference_node_id (Optional[List[str]]): The node id corresponding to the contexts
-        reference_answer ([str]): Reference answer to the query. An answer
-                                    that would receive full marks upon evaluation.
-        reference_answer_by: The reference answer generated by human or ai (model-name).
     """
 
     query: str = Field(
@@ -40,18 +33,6 @@ class LabelledRagQcaSample(BaseLlamaDataExample):
         default=None, description="What model generated the reference answer."
     )
 
-    @property
-    def class_name(self) -> str:
-        """Data example class name."""
-        return "LabelledRagQcaSample"
-
-
-class PredictedRagQcaSample(LabelledRagQcaSample):
-    """Predicted RAG example class. Analogous to traditional ML datasets, this dataset contains
-    the "features" (i.e., query + context) to make a prediction and the "label" (i.e., response)
-    to evaluate the prediction.
-    """
-
     predicted_contexts: Optional[List[str]] = Field(
         default_factory=None,
         description="The contexts used to generate the predicted answer.",
@@ -71,4 +52,50 @@ class PredictedRagQcaSample(LabelledRagQcaSample):
     @property
     def class_name(self) -> str:
         """Data example class name."""
-        return "PredictedRagQcaSample"
+        return "RagQcaSample"
+
+
+class PaiRagQcaDataset(BaseModel):
+    """Collection of ``RagQcaSample`` examples with two state flags.
+
+    ``labelled`` and ``predicted`` are booleans stored with the examples and
+    round-tripped through ``save_json`` / ``from_json``.
+    """
+
+    _example_type: Type[RagQcaSample] = RagQcaSample  # type: ignore[misc]
+    examples: List[RagQcaSample] = Field(
+        default_factory=list, description="Data examples of this dataset."
+    )
+    labelled: bool = Field(
+        default=False, description="Whether the dataset is labelled or not."
+    )
+    predicted: bool = Field(
+        default=False, description="Whether the dataset is predicted or not."
+    )
+
+    @property
+    def class_name(self) -> str:
+        """Class name."""
+        return "PaiRagQcaDataset"
+
+    def save_json(self, path: str) -> None:
+        """Write the dataset to ``path`` as UTF-8 JSON.
+
+        The file has the keys ``"examples"``, ``"labelled"`` and
+        ``"predicted"``.
+        """
+        with open(path, "w", encoding="utf-8") as f:
+            examples = [self._example_type.dict(el) for el in self.examples]
+            data = {
+                "examples": examples,
+                "labelled": self.labelled,
+                "predicted": self.predicted,
+            }
+
+            # ensure_ascii=False keeps non-ASCII text readable in the file.
+            json.dump(data, f, indent=4, ensure_ascii=False)
+            print(f"Saved PaiRagQcaDataset to {path}.")
+
+    @classmethod
+    def from_json(cls, path: str) -> Optional["PaiRagQcaDataset"]:
+        """Load a dataset from a JSON file in the ``save_json`` layout.
+
+        Returns:
+            The loaded dataset, or ``None`` when the file's ``"examples"``
+            list is empty.
+
+        Raises:
+            KeyError: If ``"examples"`` is missing, or if ``"labelled"`` or
+                ``"predicted"`` is missing from a non-empty dataset.
+        """
+        # Read with the same encoding save_json writes; the platform default
+        # may not be UTF-8 and the file can contain raw non-ASCII characters.
+        with open(path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        if not data["examples"]:
+            # An empty dataset is reported as None rather than an empty object.
+            return None
+
+        examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
+        return cls(
+            examples=examples,
+            labelled=data["labelled"],
+            predicted=data["predicted"],
+        )