Support pai-llm-evals and refactor eval exp & pipeline (#278)
* Refactor eval exp & pipeline

* Add llm-evals package

* Add example file

* Fix

* Fix multi-modal qca_dataset_path

* Fix persist_path
wwxxzz authored Nov 22, 2024
1 parent e1c4318 commit 2c5b05c
Showing 25 changed files with 787 additions and 293 deletions.
18 changes: 9 additions & 9 deletions docs/qca_generation_and_evaluation.md
@@ -19,7 +19,7 @@ The RAG evaluation tool is used to test and evaluate retrieval-based text generation systems
```yaml
- name: "exp1"
  eval_data_path: "example_data/eval_docs_text"
-  eval_modal_llm:
+  eval_model_llm:
    source: "dashscope"
    model: "qwen-max"
    max_tokens: 1024
@@ -30,7 +30,7 @@ The RAG evaluation tool is used to test and evaluate retrieval-based text generation systems
- name: the name of the evaluation experiment.
- eval_data_path: path to the evaluation dataset; supports local file paths or OSS paths.
-- eval_modal_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
+- eval_model_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
- rag_setting_file: path to the RAG settings file.
3. Evaluation dimensions:
@@ -56,7 +56,7 @@ Response
```yaml
- name: "exp2"
  eval_data_path: "example_data/eval_docs_text"
-  eval_modal_llm:
+  eval_model_llm:
    source: "dashscope"
    model: "qwen-max"
    max_tokens: 1024
@@ -71,7 +71,7 @@ Response
- name: the name of the evaluation experiment.
- eval_data_path: path to the evaluation dataset; supports local file paths or OSS paths.
-- eval_modal_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
+- eval_model_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
- rag_setting_file: path to the RAG settings file.
- tested_multimodal_llm: configuration of the multimodal LLM under evaluation.
@@ -91,7 +91,7 @@ Response
```yaml
- name: "exp3"
  qca_dataset_path: "data/eval_dataset/multimodal_eval_dataset_zh_example.json"
-  eval_modal_llm:
+  eval_model_llm:
    source: "dashscope"
    model: "qwen-max"
    max_tokens: 1024
@@ -106,7 +106,7 @@ Response
- name: the name of the evaluation experiment.
- qca_dataset_path: path to the evaluation dataset JSON file; supports local file paths.
-- eval_modal_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
+- eval_model_llm: configuration of the evaluator LLM; supports dashscope, openai, paieas, etc.
- rag_setting_file: path to the RAG settings file.
- tested_multimodal_llm: configuration of the multimodal LLM under evaluation.
@@ -121,7 +121,7 @@ Response
"reference_contexts": [
": 在赛事和政策的双重推动下,国民运动户外参与意愿高涨,超过六成的受访者表示近一年显著增加了运动户外的频率,各类运动项目正在快速走向“全民化”。新的一年,随着巴黎奥运会、美洲杯等赛事的举办,全民运动热情将进一步被激发。对于品牌而言,这是一个难得的市场机遇,通过精准地选中和锁定与运动相关的目标人群,品牌可以有效地实现用户收割。 \n\n \n\n悦己驱动,运动边界向轻量泛户外持续延伸 \n\n国民参与运动户外活动更多来自“悦己”观念的驱动,近7成的受访者表示他们主要是为了“强身健体/享受大自然”,因此轻量级、易开展的活动项目更受广大普通受众的青睐。近三年,社交平台关于“泛户外运动”的讨论热度持续走高,更是在23年春夏期间迎来一波小高峰:细分到具体的活动项目上,垂钓讨论声量较高;露营也保持较高声量,其经历过22年的大爆发、23年的行业调整,预计24年已经进入更深精细化运营;此外城市骑行热度也在不断上升,成为当下新兴的小众活动。"
],
"reference_node_id": null,
"reference_node_ids": null,
"reference_image_url_list": [
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg",
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/52d1353d4577698891e7710ae12e18b1.jpeg",
Expand All @@ -130,8 +130,8 @@ Response
"reference_answer": "根据给定的材料,2023年春夏期间,垂钓在社交平台上的讨论声量最高。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg)",
"reference_answer_by": null,
"predicted_contexts": null,
"predicted_node_id": null,
"predicted_node_score": null,
"predicted_node_ids": null,
"predicted_node_scores": null,
"predicted_image_url_list": null,
"predicted_answer": "",
"predicted_answer_by": null
Expand Down
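As a quick sanity check on the record format above, here is a minimal pydantic sketch that mirrors the fields shown. `QcaRecord` and the inline sample are illustrative stand-ins, not the project's API; the real schema is `RagQcaSample` in `src/pai_rag/evaluation/dataset/rag_qca_dataset.py` (diffed further below).

```python
import json
from typing import List, Optional

from pydantic import BaseModel


# Hypothetical mirror of the record layout above; the project's real
# schema is RagQcaSample, not this class.
class QcaRecord(BaseModel):
    query: str
    reference_contexts: Optional[List[str]] = None
    reference_node_ids: Optional[List[str]] = None
    reference_image_url_list: Optional[List[str]] = None
    reference_answer: Optional[str] = None
    predicted_node_ids: Optional[List[str]] = None
    predicted_node_scores: Optional[List[float]] = None
    predicted_answer: Optional[str] = None


sample = json.loads(
    '{"query": "example?", "reference_node_ids": null, "predicted_answer": ""}'
)
record = QcaRecord(**sample)
print(record.reference_node_ids)  # -> None
```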

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -6,7 +6,7 @@
    "reference_contexts": [
      ": 在赛事和政策的双重推动下,国民运动户外参与意愿高涨,超过六成的受访者表示近一年显著增加了运动户外的频率,各类运动项目正在快速走向“全民化”。新的一年,随着巴黎奥运会、美洲杯等赛事的举办,全民运动热情将进一步被激发。对于品牌而言,这是一个难得的市场机遇,通过精准地选中和锁定与运动相关的目标人群,品牌可以有效地实现用户收割。 \n\n \n\n悦己驱动,运动边界向轻量泛户外持续延伸 \n\n国民参与运动户外活动更多来自“悦己”观念的驱动,近7成的受访者表示他们主要是为了“强身健体/享受大自然”,因此轻量级、易开展的活动项目更受广大普通受众的青睐。近三年,社交平台关于“泛户外运动”的讨论热度持续走高,更是在23年春夏期间迎来一波小高峰:细分到具体的活动项目上,垂钓讨论声量较高;露营也保持较高声量,其经历过22年的大爆发、23年的行业调整,预计24年已经进入更深精细化运营;此外城市骑行热度也在不断上升,成为当下新兴的小众活动。"
    ],
-   "reference_node_id": null,
+   "reference_node_ids": null,
    "reference_image_url_list": [
      "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg",
      "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/52d1353d4577698891e7710ae12e18b1.jpeg",
@@ -15,8 +15,8 @@
    "reference_answer": "根据给定的材料,2023年春夏期间,垂钓在社交平台上的讨论声量最高。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg)",
    "reference_answer_by": null,
    "predicted_contexts": null,
-   "predicted_node_id": null,
-   "predicted_node_score": null,
+   "predicted_node_ids": null,
+   "predicted_node_scores": null,
    "predicted_image_url_list": null,
    "predicted_answer": "",
    "predicted_answer_by": null
@@ -25,15 +25,15 @@
    "query": "狄尔泰属于哪个教育学流派?",
    "query_by": null,
    "reference_contexts": [],
-   "reference_node_id": null,
+   "reference_node_ids": null,
    "reference_image_url_list": [
      "https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/教育文档/思维导图.jpeg"
    ],
    "reference_answer": "狄尔泰属于文化教育学流派。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/教育文档/思维导图.jpeg)",
    "reference_answer_by": null,
    "predicted_contexts": null,
-   "predicted_node_id": null,
-   "predicted_node_score": null,
+   "predicted_node_ids": null,
+   "predicted_node_scores": null,
    "predicted_image_url_list": null,
    "predicted_answer": "",
    "predicted_answer_by": null
Binary file added packages/pai_llm_evals-0.0.1-py3-none-any.whl
36 changes: 24 additions & 12 deletions src/pai_rag/config/evaluation/config.yaml
@@ -1,30 +1,42 @@
experiment:
-  # [custom knowledge dataset]
-  - name: "exp1"
+  # [text dataset][pai-eval]
+  - name: "text_exp1"
    eval_data_path: "example_data/eval_docs_text"
-    eval_modal_llm:
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml"
+    eval_model_llm:
      source: "dashscope"
      model: "qwen-max"
      max_tokens: 1024
-    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml"
-  - name: "exp2"
-    eval_data_path: "example_data/eval_docs_text"
-    eval_modal_llm:
+    use_pai_eval: true
+  # [custom text dataset][crag]
+  - name: "text_exp2"
+    dataset: "crag"
+    eval_data_path: "example_data/eval_docs_crag_small"
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_crag_text.toml"
+    eval_model_llm:
      source: "dashscope"
      model: "qwen-max"
      max_tokens: 1024
+  # [multi-modal dataset]
+  - name: "multi_modal_exp1"
+    eval_data_path: "example_data/eval_docs_image"
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
-    tested_multimodal_llm:
+    eval_model_llm:
      source: "dashscope"
      model: "qwen-vl-max"
      max_tokens: 1024
-  - name: "exp3"
-    qca_dataset_path: "data/eval_dataset/multimodal_eval_dataset_zh_example.json"
-    eval_modal_llm:
+    tested_multimodal_llm:
      source: "dashscope"
-      model: "qwen-max"
+      model: "qwen-vl-max"
      max_tokens: 1024
+  # [custom multi-modal dataset]
+  - name: "multi_modal_exp2"
+    qca_dataset_path: "example_data/eval_docs_image_example/multimodal_eval_dataset_zh_example.json"
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
+    eval_model_llm:
+      source: "dashscope"
+      model: "qwen-vl-max"
+      max_tokens: 1024
    tested_multimodal_llm:
      source: "dashscope"
      model: "qwen-vl-max"
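For orientation, a minimal sketch of consuming the experiment list above with PyYAML. The dispatch keys follow the config shown; the entry-point shape is an assumption, not the pipeline's actual CLI.

```python
import yaml

# Load the experiment list and show how each entry self-describes its mode:
# a "dataset" key selects a named dataset (e.g. crag), and a
# "qca_dataset_path" key means a prebuilt QCA dataset is evaluated directly.
with open("src/pai_rag/config/evaluation/config.yaml") as f:
    config = yaml.safe_load(f)

for exp in config["experiment"]:
    mode = "prebuilt QCA" if "qca_dataset_path" in exp else "generate QCA"
    print(exp["name"], exp.get("dataset", "default"), mode)
```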
95 changes: 95 additions & 0 deletions src/pai_rag/config/evaluation/settings_eval_for_crag_text.toml
@@ -0,0 +1,95 @@
dynaconf_merge = true

[rag]
name = "pai_rag"
version = "0.1.1"

[rag.agent]
custom_agent_config_file = ""
agent_tool_type = ""

[rag.chat_store]
type = "Local" # [Local, Aliyun-Redis]
host = "Aliyun-Redis host"
password = "Aliyun-Redis user:pwd"
persist_path = "localdata/eval_exp_data/storage"

[rag.data_analysis]
type = "pandas"
nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "

[rag.data_reader]
type = "SimpleDirectoryReader"

# Embedding configuration. Supported API sources: OpenAI, DashScope; local models are served via HuggingFace.
# For an API source, set OPENAI_API_KEY or DASHSCOPE_API_KEY in the environment; for HuggingFace, set the model name.
# e.g.
# source = "HuggingFace"
# model = "bge-large-zh-v1.5"
# embed_batch_size = 10
[rag.embedding]
source = "DashScope"
embed_batch_size = 10

[rag.index]
persist_path = "localdata/eval_exp_data/storage"
enable_multimodal = false
vector_store.type = "FAISS"

# LLM configuration. Supported sources: OpenAI, DashScope, or a PAI-EAS deployment.
# e.g.
# source = "PaiEas"
# model = ""
# endpoint = ""
# token = ""
[rag.llm]
source = "OpenAI"
model = "gpt-4o-2024-08-06"

[rag.multimodal_embedding]
source = "cnclip"

[rag.multimodal_llm]
source = "dashscope"
model = "qwen-vl-plus"

[rag.node_enhancement]
tree_depth = 3
max_clusters = 52
proba_threshold = 0.10

[rag.node_parser]
type = "Sentence"
chunk_size = 500
chunk_overlap = 10
enable_multimodal = false

[rag.oss_store]
bucket = ""
endpoint = "oss-cn-hangzhou.aliyuncs.com"

[rag.postprocessor]
reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker]
reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
keyword_weight = 0.3
vector_weight = 0.7
similarity_threshold = 0.5
top_n = 2

[rag.query_transform]
type = ""

[rag.retriever]
similarity_top_k = 5
retrieval_mode = "hybrid" # [hybrid, embedding, keyword, router]
query_rewrite_n = 1 # set to 1 to disable query generation

[rag.search]
search_api_key = ""

[rag.synthesizer]
type = "SimpleSummarize"
text_qa_template = "You are a helpful assistant.\nYou are given a Question and References. The references may or may not help answer the question. Your task is to answer the question in as few words as possible.\nPlease follow these guidelines when formulating your answer:\n1. If the question contains a false premise or assumption, answer “invalid question”.\n2. If you are uncertain or don’t know the answer, respond with “I don’t know”.\n\n### Question \n{query_str} \n\n### References:\n {context_str} \n\n### Answer:\n"

[rag.trace]
type = "arize_phoenix"
54 changes: 54 additions & 0 deletions src/pai_rag/evaluation/dataset/crag/crag_data_loader.py
@@ -0,0 +1,54 @@
from typing import Any, Optional
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from loguru import logger


class CragDataLoader:
    def __init__(
        self,
        data_reader: PaiDataReader,
        embed_model: Any = None,
        vector_index: Optional[VectorStoreIndex] = None,
    ):
        self._data_reader = data_reader
        self._embed_model = embed_model
        self._vector_index = vector_index

    def load_data(
        self,
        file_path_or_directory: str,
        from_oss: bool = False,
        oss_path: Optional[str] = None,
        filter_pattern: Optional[str] = None,
        enable_raptor: bool = False,  # currently unused by this loader
    ):
        """Load data from a file or directory."""
        documents = self._data_reader.load_data(
            file_path_or_directory=file_path_or_directory,
            filter_pattern=filter_pattern,
            oss_path=oss_path,
            from_oss=from_oss,
        )
        if from_oss:
            logger.info(f"Loaded {len(documents)} documents from {oss_path}")
        else:
            logger.info(
                f"Loaded {len(documents)} documents from {file_path_or_directory}"
            )

        # Embed documents directly; no node parser is applied here.
        transformations = [
            self._embed_model,
        ]

        ingestion_pipeline = IngestionPipeline(transformations=transformations)

        nodes = ingestion_pipeline.run(documents=documents)
        logger.info(
            f"[DataLoader] parsed {len(documents)} documents into {len(nodes)} nodes."
        )

        self._vector_index.insert_nodes(nodes)
        logger.info(f"[DataLoader] Inserted {len(nodes)} nodes.")
        logger.info("[DataLoader] Ingestion Completed!")
45 changes: 45 additions & 0 deletions src/pai_rag/evaluation/dataset/crag/crag_jsonl_reader.py
@@ -0,0 +1,45 @@
"""Tabular parser-Excel parser.
Contains parsers for tabular data files.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
import json


class CragJsonLReader(BaseReader):
"""JsonL reader."""

def __init__(self, *args: Any, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

def load_data(
self,
file_path: Path,
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
with open(file_path, "r", encoding="utf-8") as file:
json_lines = [line.strip() for line in file.readlines()]

docs = []
for i, text in enumerate(json_lines):
json_data = json.loads(text)
search_results = json_data["search_results"]
for j, search_result in enumerate(search_results):
extra_info["row_number"] = i + 1
extra_info["dataset_source"] = "crag"
docs.append(
Document(
doc_id=f"{json_data['interaction_id']}__{j}",
text=search_result["page_snippet"],
metadata=extra_info,
)
)
return docs
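A short, self-contained sketch of the JSONL record shape this reader expects, with fabricated values standing in for real CRAG rows:

```python
import json
import tempfile
from pathlib import Path

from pai_rag.evaluation.dataset.crag.crag_jsonl_reader import CragJsonLReader

# One JSONL row: an interaction id plus its web search results.
row = {
    "interaction_id": "abc123",
    "search_results": [
        {"page_snippet": "Paris is the capital of France."},
        {"page_snippet": "France is in Western Europe."},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(row) + "\n")

reader = CragJsonLReader()
docs = reader.load_data(Path(f.name), extra_info={})
print([d.doc_id for d in docs])  # -> ['abc123__0', 'abc123__1']
```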
6 changes: 3 additions & 3 deletions src/pai_rag/evaluation/dataset/rag_qca_dataset.py
@@ -23,7 +23,7 @@ class RagQcaSample(BaseLlamaDataExample):
        default_factory=None,
        description="The contexts used to generate the reference answer.",
    )
-    reference_node_id: Optional[List[str]] = Field(
+    reference_node_ids: Optional[List[str]] = Field(
        default_factory=None, description="The node id corresponding to the contexts"
    )
    reference_image_url_list: Optional[List[str]] = Field(
@@ -42,11 +42,11 @@
        default_factory=None,
        description="The contexts used to generate the predicted answer.",
    )
-    predicted_node_id: Optional[List[str]] = Field(
+    predicted_node_ids: Optional[List[str]] = Field(
        default_factory=None,
        description="The node id corresponding to the predicted contexts",
    )
-    predicted_node_score: Optional[List[float]] = Field(
+    predicted_node_scores: Optional[List[float]] = Field(
        default_factory=None,
        description="The node score corresponding to the predicted contexts",
    )
