fix_multimodal_eval_pipeline (#277)
Ceceliachenen authored Nov 21, 2024
1 parent 37bbd04 commit e1c4318
Showing 6 changed files with 229 additions and 39 deletions.
44 changes: 44 additions & 0 deletions data/eval_dataset/multimodal_eval_dataset_zh_example.json
@@ -0,0 +1,44 @@
{
"examples": [
{
"query": "2023年春夏期间,哪种泛户外运动在社交平台上的讨论声量最高?",
"query_by": null,
"reference_contexts": [
": 在赛事和政策的双重推动下,国民运动户外参与意愿高涨,超过六成的受访者表示近一年显著增加了运动户外的频率,各类运动项目正在快速走向“全民化”。新的一年,随着巴黎奥运会、美洲杯等赛事的举办,全民运动热情将进一步被激发。对于品牌而言,这是一个难得的市场机遇,通过精准地选中和锁定与运动相关的目标人群,品牌可以有效地实现用户收割。 \n\n \n\n悦己驱动,运动边界向轻量泛户外持续延伸 \n\n国民参与运动户外活动更多来自“悦己”观念的驱动,近7成的受访者表示他们主要是为了“强身健体/享受大自然”,因此轻量级、易开展的活动项目更受广大普通受众的青睐。近三年,社交平台关于“泛户外运动”的讨论热度持续走高,更是在23年春夏期间迎来一波小高峰:细分到具体的活动项目上,垂钓讨论声量较高;露营也保持较高声量,其经历过22年的大爆发、23年的行业调整,预计24年已经进入更深精细化运营;此外城市骑行热度也在不断上升,成为当下新兴的小众活动。"
],
"reference_node_id": null,
"reference_image_url_list": [
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg",
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/52d1353d4577698891e7710ae12e18b1.jpeg",
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/4f77ded6421ddadd519ab9ef1601a784.jpeg"
],
"reference_answer": "根据给定的材料,2023年春夏期间,垂钓在社交平台上的讨论声量最高。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg)",
"reference_answer_by": null,
"predicted_contexts": null,
"predicted_node_id": null,
"predicted_node_score": null,
"predicted_image_url_list": null,
"predicted_answer": "",
"predicted_answer_by": null
},
{
"query": "狄尔泰属于哪个教育学流派?",
"query_by": null,
"reference_contexts": [],
"reference_node_id": null,
"reference_image_url_list": [
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/教育文档/思维导图.jpeg"
],
"reference_answer": "狄尔泰属于文化教育学流派。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/教育文档/思维导图.jpeg)",
"reference_answer_by": null,
"predicted_contexts": null,
"predicted_node_id": null,
"predicted_node_score": null,
"predicted_image_url_list": null,
"predicted_answer": "",
"predicted_answer_by": null
}
],
"labelled": true,
"predicted": false
}
154 changes: 154 additions & 0 deletions docs/qca_generation_and_evaluation.md
@@ -0,0 +1,154 @@
# QCA GENERATION AND EVALUATION

A RAG evaluation tool is a method or framework for testing and assessing retrieval-augmented text generation systems. It evaluates retrieval accuracy as well as the quality and relevance of generated content, using metrics such as precision, recall, consistency, and coherence. It helps developers understand and optimize RAG applications so they are better suited to real-world use. Compared with manual evaluation, RAG evaluation tools are more objective, accurate, and efficient, and they can run large-scale evaluations automatically, enabling faster iteration and optimization.

## Running an evaluation

Configure the evaluation experiments in a YAML file, then run the following command:

```bash
run_eval_exp [-i yaml_path] [--o output_path]
```

For a sample configuration, see src/pai_rag/config/evaluation/config.yaml.

### Experiment type 1: build an evaluation dataset from your documents and evaluate the RAG system

1. Example configuration:

```yaml
- name: "exp1"
eval_data_path: "example_data/eval_docs_text"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml"
```
2. Parameters:
   - name: name of the evaluation experiment.
   - eval_data_path: path to the evaluation corpus; local paths and OSS paths are supported.
   - eval_modal_llm: configuration of the judge LLM; dashscope, openai, and paieas sources are supported.
   - rag_setting_file: path to the RAG settings file.
3. Evaluation metrics:

   Retrieval

   | Metric  | Description               |
   | ------- | ------------------------- |
   | hitrate | score in the [0, 1] range |
   | mrr     | score in the [0, 1] range |

   Response

   | Metric       | Description                                                                                 |
   | ------------ | ------------------------------------------------------------------------------------------- |
   | faithfulness | score is 0 or 1, where 1 means the answer is supported by the context and 0 means it is not |
   | correctness  | score from 1 to 5, where 1 is the lowest and 5 the highest                                  |
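As a reference for how these retrieval scores are typically computed, here is a minimal sketch; the function names are illustrative and not part of the pai_rag API:

```python
def hit_rate(retrieved_ids, reference_ids):
    """1.0 if any reference node id appears in the retrieved list, else 0.0."""
    return 1.0 if any(r in reference_ids for r in retrieved_ids) else 0.0


def mrr(retrieved_ids, reference_ids):
    """Reciprocal rank of the first retrieved id that matches a reference."""
    for rank, rid in enumerate(retrieved_ids, start=1):
        if rid in reference_ids:
            return 1.0 / rank
    return 0.0
```

Both scores are averaged over all queries in the dataset to produce the reported metric.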
### Experiment type 2: build an evaluation dataset from your documents and evaluate a multimodal LLM

1. Example configuration:

```yaml
- name: "exp2"
eval_data_path: "example_data/eval_docs_text"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
tested_multimodal_llm:
source: "dashscope"
model: "qwen-vl-max"
max_tokens: 1024
```
2. Parameters:
   - name: name of the evaluation experiment.
   - eval_data_path: path to the evaluation corpus; local paths and OSS paths are supported.
   - eval_modal_llm: configuration of the judge LLM; dashscope, openai, and paieas sources are supported.
   - rag_setting_file: path to the RAG settings file.
   - tested_multimodal_llm: configuration of the multimodal LLM under evaluation.
3. Evaluation metrics:

   Response

   | Metric       | Description                                                                                 |
   | ------------ | ------------------------------------------------------------------------------------------- |
   | faithfulness | score is 0 or 1, where 1 means the answer is supported by the context and 0 means it is not |
   | correctness  | score from 1 to 5, where 1 is the lowest and 5 the highest                                  |
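The binary faithfulness verdict is parsed from the judge model's free-form reply. A minimal sketch, consistent with the parser updated in this commit (the function name is illustrative):

```python
def parse_faithfulness_verdict(raw_response: str) -> bool:
    """Map the judge LLM's free-form reply to a pass/fail flag.

    The parser in this commit accepts both an English "yes" and the
    Chinese verdict "答案:是" ("answer: yes").
    """
    text = raw_response.lower()
    return "yes" in text or "答案:是" in text
```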
### Experiment type 3: evaluate a multimodal LLM with an existing evaluation dataset JSON file

1. Example configuration:

```yaml
- name: "exp3"
qca_dataset_path: "data/eval_dataset/multimodal_eval_dataset_zh_example.json"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
tested_multimodal_llm:
source: "dashscope"
model: "qwen-vl-max"
max_tokens: 1024
```
2. Parameters:
   - name: name of the evaluation experiment.
   - qca_dataset_path: path to the evaluation dataset JSON file; only local paths are supported.
   - eval_modal_llm: configuration of the judge LLM; dashscope, openai, and paieas sources are supported.
   - rag_setting_file: path to the RAG settings file.
   - tested_multimodal_llm: configuration of the multimodal LLM under evaluation.
3. Evaluation dataset format:

```json
{
"examples": [
{
"query": "2023年春夏期间,哪种泛户外运动在社交平台上的讨论声量最高?",
"query_by": null,
"reference_contexts": [
": 在赛事和政策的双重推动下,国民运动户外参与意愿高涨,超过六成的受访者表示近一年显著增加了运动户外的频率,各类运动项目正在快速走向“全民化”。新的一年,随着巴黎奥运会、美洲杯等赛事的举办,全民运动热情将进一步被激发。对于品牌而言,这是一个难得的市场机遇,通过精准地选中和锁定与运动相关的目标人群,品牌可以有效地实现用户收割。 \n\n \n\n悦己驱动,运动边界向轻量泛户外持续延伸 \n\n国民参与运动户外活动更多来自“悦己”观念的驱动,近7成的受访者表示他们主要是为了“强身健体/享受大自然”,因此轻量级、易开展的活动项目更受广大普通受众的青睐。近三年,社交平台关于“泛户外运动”的讨论热度持续走高,更是在23年春夏期间迎来一波小高峰:细分到具体的活动项目上,垂钓讨论声量较高;露营也保持较高声量,其经历过22年的大爆发、23年的行业调整,预计24年已经进入更深精细化运营;此外城市骑行热度也在不断上升,成为当下新兴的小众活动。"
],
"reference_node_id": null,
"reference_image_url_list": [
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg",
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/52d1353d4577698891e7710ae12e18b1.jpeg",
"https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/4f77ded6421ddadd519ab9ef1601a784.jpeg"
],
"reference_answer": "根据给定的材料,2023年春夏期间,垂钓在社交平台上的讨论声量最高。\n\n![](https://pai-rag.oss-cn-hangzhou.aliyuncs.com/pairag/doc_images/2024春夏淘宝天猫运动户外行业趋势白皮书_淘宝/d4e624aceb4043839c924e33c075e388.jpeg)",
"reference_answer_by": null,
"predicted_contexts": null,
"predicted_node_id": null,
"predicted_node_score": null,
"predicted_image_url_list": null,
"predicted_answer": "",
"predicted_answer_by": null
}
],
"labelled": true,
"predicted": false
}
```

Note: each example must contain the query field, at least one of reference_contexts or reference_image_url_list, and the reference_answer field, and the dataset must set labelled: true and predicted: false.
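A small helper can check that a dataset satisfies this contract before an evaluation run; this is an illustrative sketch, not part of the evaluation pipeline:

```python
import json


def validate_qca_dataset(data: dict) -> int:
    """Check the dataset contract described above; return the example count."""
    assert data.get("labelled") is True, "dataset must be labelled"
    assert data.get("predicted") is False, "predictions must not be filled in yet"
    for example in data["examples"]:
        assert example.get("query"), "query is required"
        assert example.get("reference_answer"), "reference_answer is required"
        # at least one grounding source: text contexts or image URLs
        assert example.get("reference_contexts") or example.get(
            "reference_image_url_list"
        ), "reference_contexts or reference_image_url_list is required"
    return len(data["examples"])


# Usage with a dataset file:
# with open("data/eval_dataset/multimodal_eval_dataset_zh_example.json",
#           encoding="utf-8") as f:
#     n = validate_qca_dataset(json.load(f))
```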

4. Evaluation metrics:

Response

| Metric       | Description                                                                                 |
| ------------ | ------------------------------------------------------------------------------------------- |
| faithfulness | score is 0 or 1, where 1 means the answer is supported by the context and 0 means it is not |
| correctness  | score from 1 to 5, where 1 is the lowest and 5 the highest                                  |
20 changes: 13 additions & 7 deletions src/pai_rag/config/evaluation/config.yaml
@@ -2,22 +2,28 @@ experiment:
# [custom knowledge dataset]
- name: "exp1"
eval_data_path: "example_data/eval_docs_text"
eval_model_source: "Dashscope"
eval_model_name: "qwen-max"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml"
- name: "exp2"
eval_data_path: "example_data/eval_docs_text"
eval_model_source: "Dashscope"
eval_model_name: "qwen-vl-max"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
tested_multimodal_llm:
source: "dashscope"
model: "qwen-vl-max"
max_tokens: 1024
- name: "exp3"
qca_dataset_path: "data/eval_dataset/multimodal_eval_dataset_zh.json"
eval_model_source: "Dashscope"
eval_model_name: "qwen-vl-max"
qca_dataset_path: "data/eval_dataset/multimodal_eval_dataset_zh_example.json"
eval_modal_llm:
source: "dashscope"
model: "qwen-max"
max_tokens: 1024
rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
tested_multimodal_llm:
source: "dashscope"
11 changes: 7 additions & 4 deletions src/pai_rag/evaluation/metrics/response/faithfulness.py
@@ -38,9 +38,9 @@

DEFAULT_MULTIMODAL_EVAL_TEMPLATE = PromptTemplate(
"""
请告诉我一段信息是否得到上下文的支持。
请告诉我一段信息是否得到上下文的支持。如果上下文中有图片链接,请读取图片获取信息。\n\n"
你需要回答“是”或“否”。
如果任何上下文支持该信息,即使大部分上下文无关,也请回答“是”。
如果任何上下文或者图片中的信息支持该信息,即使大部分上下文无关或者图片内容无关,也请回答“是”。
下面提供了一些示例。\n\n
信息:苹果派通常是双皮的。
上下文:苹果派是一种水果派,主要填充成分是苹果。
@@ -103,7 +103,7 @@ def __init__(

def parse_eval_result(self, eval_result: str):
raw_response_txt = eval_result.lower()
if "yes" in raw_response_txt:
if "yes" in raw_response_txt or "答案:是" in raw_response_txt:
passing = True
else:
passing = False
@@ -170,9 +170,12 @@ async def aevaluate_multimodal(
) or response_answer is None:
raise ValueError("contexts and response must be provided")

image_context_str = "\n\n".join(reference_image_url_list)
text_context_str = "\n\n".join(contexts)

prompt_str = self._multimodal_eval_template.format(
response_str=response_answer,
context_str="\n".join(contexts),
context_str=f"{text_context_str}\n\n图片链接列表: \n\n{image_context_str}\n\n",
reference_image_url_list=reference_image_url_list or "(没有提供参考图片链接)",
)
image_documents = load_image_urls(reference_image_url_list)
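The hunk above joins the text contexts and the image URLs into a single context string before formatting the evaluation prompt. In isolation, the assembly looks roughly like this (a standalone sketch of the same string-building, outside the evaluator class):

```python
def build_context_str(contexts, reference_image_url_list):
    """Combine text contexts and image URLs the way the updated metric does."""
    text_context_str = "\n\n".join(contexts)
    image_context_str = "\n\n".join(reference_image_url_list)
    # "图片链接列表" means "list of image links"; the separator convention
    # matches the f-string introduced in this commit.
    return f"{text_context_str}\n\n图片链接列表: \n\n{image_context_str}\n\n"
```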
6 changes: 2 additions & 4 deletions src/pai_rag/evaluation/run_evaluation_experiments.py
@@ -37,8 +37,7 @@ def run_experiment(exp_params):
config=exp_params["rag_setting_file"],
data_path=exp_params["eval_data_path"],
exp_name=exp_name,
eval_model_source=exp_params["eval_model_source"],
eval_model_name=exp_params["eval_model_name"],
eval_model_llm_config=exp_params["eval_modal_llm"],
)
logger.info(f"Finished experiment with name={exp_name}")
except Exception as e:
@@ -57,8 +56,7 @@ def run_multimodal_experiment(exp_params):
qca_dataset_path=exp_params.get("qca_dataset_path", None),
data_path=exp_params.get("eval_data_path", None),
exp_name=exp_name,
eval_model_source=exp_params["eval_model_source"],
eval_model_name=exp_params["eval_model_name"],
eval_model_llm_config=exp_params["eval_modal_llm"],
tested_multimodal_llm_config=exp_params.get("tested_multimodal_llm", None),
)
logger.info(f"Finished experiment with name={exp_name}")
33 changes: 9 additions & 24 deletions src/pai_rag/evaluation/run_evaluation_pipeline.py
@@ -27,9 +27,7 @@
)


def _create_components(
config_file, exp_name, eval_model_source, eval_model_name
) -> None:
def _create_components(config_file, exp_name, eval_model_llm_config) -> None:
"""Create all components from the default config file."""
config = RagConfigManager.from_file(config_file).get_value()
mode = "image" if config.retriever.search_image else "text"
@@ -43,12 +41,7 @@ def _create_components(
data_loader = resolve_data_loader(config)
vector_index = resolve_vector_index(config)
query_engine = resolve_query_engine(config)
eval_llm_config_data = {
"source": eval_model_source.lower(),
"model": eval_model_name,
"max_tokens": 1024,
}
eval_llm_config = parse_llm_config(eval_llm_config_data)
eval_llm_config = parse_llm_config(eval_model_llm_config)
if mode == "text":
llm = resolve(cls=PaiLlm, llm_config=config.llm)
eval_llm = create_llm(eval_llm_config)
@@ -76,8 +69,7 @@
def _create_multimodal_components(
config_file,
exp_name,
eval_model_source,
eval_model_name,
eval_model_llm_config,
tested_multimodal_llm_config,
qca_dataset_path: str = None,
) -> None:
@@ -94,12 +86,7 @@ def _create_multimodal_components(
data_loader = resolve_data_loader(config)
vector_index = resolve_vector_index(config)
query_engine = resolve_query_engine(config)
eval_llm_config_data = {
"source": eval_model_source.lower(),
"model": eval_model_name,
"max_tokens": 1024,
}
eval_llm_config = parse_llm_config(eval_llm_config_data)
eval_llm_config = parse_llm_config(eval_model_llm_config)
eval_llm = create_multi_modal_llm(eval_llm_config)

llm = resolve(cls=PaiMultiModalLlm, llm_config=config.multimodal_llm)
@@ -117,6 +104,7 @@
)
if qca_dataset_path:
persist_path = os.path.join("localdata/eval_exp_data", exp_name)
os.makedirs(persist_path, exist_ok=True)
else:
persist_path = config.index.vector_store.persist_path
evaluator = BaseEvaluator(
@@ -135,8 +123,7 @@ def run_evaluation_pipeline(
data_path=None,
pattern=None,
exp_name="default",
eval_model_source=None,
eval_model_name=None,
eval_model_llm_config=None,
):
assert (oss_path is not None) or (
data_path is not None
@@ -146,7 +133,7 @@
), f"Can not provide both local path '{data_path}' and oss path '{oss_path}'."

data_loader, qca_generator, evaluator = _create_components(
config, exp_name, eval_model_source, eval_model_name
config, exp_name, eval_model_llm_config
)
data_loader.load_data(
file_path_or_directory=data_path,
Expand All @@ -173,15 +160,13 @@ def run_multimodal_evaluation_pipeline(
data_path=None,
pattern=None,
exp_name="default",
eval_model_source=None,
eval_model_name=None,
eval_model_llm_config=None,
tested_multimodal_llm_config=None,
):
data_loader, multimodal_qca_generator, evaluator = _create_multimodal_components(
config,
exp_name,
eval_model_source,
eval_model_name,
eval_model_llm_config,
tested_multimodal_llm_config,
qca_dataset_path,
)
