Commit 7de2849
Support auto evaluation for multi-modal (#258)
* Support MM: text and image eval

* Support MultiModal Eval
wwxxzz authored Oct 30, 2024
1 parent 403b7e7 commit 7de2849
Showing 16 changed files with 653 additions and 181 deletions.
Binary file added example_data/eval_docs_image/跑鞋推荐.pdf
44 changes: 44 additions & 0 deletions example_data/eval_docs_text/EasyRec.txt
@@ -0,0 +1,44 @@
EasyRec is an easy-to-use recommendation framework
EasyRec implements state-of-the-art machine learning models used in common recommendation tasks: candidate generation (matching), scoring (ranking), and multi-task learning. It improves the efficiency of producing high-performance models through simple configuration and hyperparameter optimization (HPO).

EasyRec video introduction
Why EasyRec?
Run everywhere
MaxCompute / DataScience / DLC / Local
TF1.12-1.15 / TF2.x / PAI-TF
Diverse input data
MaxCompute tables
HDFS files
OSS files
Kafka streams
Local CSV

Simple to configure
Flexible feature configuration and simple model configuration
Efficient and robust feature generation (used in Taobao)
Nice web interface in development

It is smart
EarlyStop / Best Checkpoint Saver
Hyperparameter search / AutoFeatureCross
In development: NAS, knowledge distillation, multi-modal

Large scale and easy deployment
Support for large-scale embeddings and incremental saving
Many parallel strategies: ParameterServer, Mirrored, MultiWorker
Easy deployment to EAS: auto-scaling, easy monitoring
Consistency guarantee between training and serving

A variety of models
DSSM / MIND / DropoutNet / CoMetricLearningI2I / PDN
W&D / DeepFM / MultiTower / DCN / DIN / BST
MMoE / ESMM / DBMTL / PLE
CMBF / UNITER

Easy to customize
Easy to implement customized models
No need to care about the data pipeline
Fast vector retrieval
Run the kNN algorithm on vectors in a distributed environment

Welcome to join the EasyRec recommendation algorithm discussion group, DingTalk group number: 32260796
14 changes: 14 additions & 0 deletions example_data/eval_docs_text/PAI.txt
@@ -0,0 +1,14 @@
Machine Learning Platform for AI (PAI) is Alibaba Cloud's artificial intelligence platform, providing one-stop machine learning solutions. This document introduces Machine Learning PAI.

What is machine learning
Machine learning means that a machine learns from large amounts of historical data via statistical algorithms and then uses the resulting model to guide business decisions. Machine learning is currently applied mainly in the following scenarios:
Marketing scenarios: product recommendation, user profiling, or precisely targeted advertising.
Finance scenarios: loan issuance prediction, financial risk control, stock trend prediction, or gold price prediction.
Social-network relationship mining scenarios: Weibo influencer analysis or social relationship chain analysis.
Text scenarios: news classification, keyword extraction, article summarization, or text content analysis.
Unstructured data processing scenarios: image classification or extracting text content from images.
Other prediction scenarios: rainfall prediction or soccer match result prediction.
Machine learning includes traditional machine learning and deep learning. Traditional machine learning falls into the following categories:
Supervised learning: every sample has a corresponding target value; a model is built to map input feature vectors to target values, e.g., for regression and classification problems.
Unsupervised learning: samples carry no target values; the goal is to discover latent patterns in the data itself, e.g., clustering problems.
Reinforcement learning: comparatively complex; the system continuously interacts with its environment and chooses actions based on external feedback so as to optimize toward a goal, e.g., AlphaGo and autonomous driving.
12 changes: 8 additions & 4 deletions src/pai_rag/config/evaluation/config.yaml
@@ -1,8 +1,12 @@
 experiment:
   # [custom knowledge dataset]
   - name: "exp1"
-    data_path: "example_data/eval_docs"
-    setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
+    eval_data_path: "example_data/eval_docs_text"
+    eval_model_source: "Dashscope"
+    eval_model_name: "qwen-max"
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_text.toml"
   - name: "exp2"
-    data_path: "example_data/eval_docs_1"
-    setting_file: "src/pai_rag/config/evaluation/settings_eval.toml"
+    eval_data_path: "example_data/eval_docs_image"
+    eval_model_source: "Dashscope"
+    eval_model_name: "qwen-vl-max"
+    rag_setting_file: "src/pai_rag/config/evaluation/settings_eval_for_image.toml"
99 changes: 99 additions & 0 deletions src/pai_rag/config/evaluation/settings_eval_for_image.toml
@@ -0,0 +1,99 @@
dynaconf_merge = true

[rag]
name = "pai_rag"
version = "0.1.1"

[rag.agent]
custom_agent_config_file = ""
agent_tool_type = ""

[rag.chat_store]
type = "Local" # [Local, Aliyun-Redis]
host = "Aliyun-Redis host"
password = "Aliyun-Redis user:pwd"
persist_path = "localdata/eval_exp_data/storage"

[rag.data_analysis]
type = "pandas"
nl2sql_prompt = "给定一个输入问题,创建一个语法正确的{dialect}查询语句来执行,不要从特定的表中查询所有列,只根据问题查询几个相关的列。请注意只使用你在schema descriptions 中看到的列名。\n=====\n 小心不要查询不存在的列。请注意哪个列位于哪个表中。必要时,请使用表名限定列名。\n=====\n 你必须使用以下格式,每项占一行:\n\n Question: Question here\n SQLQuery: SQL Query to run \n\n Only use tables listed below.\n {schema}\n\n Question: {query_str} \n SQLQuery: "

[rag.data_reader]
type = "SimpleDirectoryReader"

# Embedding configurations. Supported API sources: OpenAI, DashScope; local models via HuggingFace.
# When using an API, set OPENAI_API_KEY or DASHSCOPE_API_KEY in the environment; when using HuggingFace, set the model.
# eg.
# source = "HuggingFace"
# model = "bge-large-zh-v1.5"
# embed_batch_size = 10
[rag.embedding]
source = "DashScope"
embed_batch_size = 10

[rag.index]
persist_path = "localdata/eval_exp_data/storage"
enable_multimodal = true
vector_store.type = "FAISS"

# LLM configurations. Supported sources: OpenAI or DashScope APIs, or a PAI-EAS deployment.
# eg.
# source = "PaiEas"
# model = ""
# endpoint = ""
# token = ""
[rag.llm]
source = "DashScope"
model = "qwen-turbo"

[rag.multimodal_embedding]
source = "cnclip"

[rag.multimodal_llm]
source = "dashscope"
model = "qwen-vl-plus"

[rag.node_enhancement]
tree_depth = 3
max_clusters = 52
proba_threshold = 0.10

[rag.node_parser]
type = "Sentence"
chunk_size = 500
chunk_overlap = 10
enable_multimodal = true

[rag.oss_store]
bucket = "pai-rag"
endpoint = "oss-cn-hangzhou.aliyuncs.com"
prefix = "evaluation"

[rag.postprocessor]
reranker_type = "no-reranker" # [simple-weighted-reranker, model-based-reranker]
reranker_model = "bge-reranker-base" # [bge-reranker-base, bge-reranker-large]
keyword_weight = 0.3
vector_weight = 0.7
similarity_threshold = 0.5
top_n = 2

[rag.query_transform]
type = ""

[rag.retriever]
similarity_top_k = 3
retrieval_mode = "hybrid" # [hybrid, embedding, keyword, router]
query_rewrite_n = 1 # set to 1 to disable query generation
search_image = true

[rag.search]
search_api_key = ""

[rag.synthesizer]
type = "SimpleSummarize"
text_qa_template = "参考内容信息如下\n---------------------\n{context_str}\n---------------------根据提供内容而非其他知识回答问题.\n问题: {query_str}\n答案: \n"

[rag.trace]
type = "pai_trace"
endpoint = "http://tracing-analysis-dc-hz.aliyuncs.com:8090"
token = ""
@@ -33,7 +33,7 @@ embed_batch_size = 10

 [rag.index]
 persist_path = "localdata/eval_exp_data/storage"
-enable_multimodal = true
+enable_multimodal = false
 vector_store.type = "FAISS"

 # LLM configurations. Supported sources: OpenAI or DashScope APIs, or a PAI-EAS deployment.
@@ -62,7 +62,7 @@ proba_threshold = 0.10
 type = "Sentence"
 chunk_size = 500
 chunk_overlap = 10
-enable_multimodal = true
+enable_multimodal = false

 [rag.oss_store]
 bucket = ""
4 changes: 4 additions & 0 deletions src/pai_rag/evaluation/dataset/rag_eval_dataset.py
@@ -3,6 +3,7 @@
 import json
 from llama_index.core.bridge.pydantic import BaseModel
 from pai_rag.evaluation.dataset.rag_qca_dataset import RagQcaSample
+from llama_index.core.llama_dataset import CreatedBy


 class EvaluationSample(RagQcaSample):
@@ -36,6 +37,9 @@ class EvaluationSample(RagQcaSample):
         default_factory=None,
         description="The correctness reason for response evaluation.",
     )
+    evaluated_by: Optional[CreatedBy] = Field(
+        default=None, description="What model generated the evaluation result."
+    )

     @property
     def class_name(self) -> str:
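A quick illustration of the new provenance field; everything here other than evaluated_by and the query field is an assumption about the sample's defaults:

    # Hypothetical usage sketch -- not part of the commit.
    from llama_index.core.llama_dataset import CreatedBy, CreatedByType

    judge = CreatedBy(type=CreatedByType.AI, model_name="qwen-max")
    sample = EvaluationSample(query="What is EasyRec?", evaluated_by=judge)
    print(sample.evaluated_by.model_name)  # "qwen-max"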
8 changes: 8 additions & 0 deletions src/pai_rag/evaluation/dataset/rag_qca_dataset.py
@@ -25,6 +25,10 @@ class RagQcaSample(BaseLlamaDataExample):
     reference_node_id: Optional[List[str]] = Field(
         default_factory=None, description="The node id corresponding to the contexts"
     )
+    reference_image_url_list: Optional[List[str]] = Field(
+        default_factory=None,
+        description="The image urls used to generate the reference answer.",
+    )
     reference_answer: str = Field(
         default_factory=str,
         description="The reference (ground-truth) answer to the example.",
@@ -41,6 +45,10 @@
         default_factory=None,
         description="The node id corresponding to the predicted contexts",
     )
+    predicted_image_url_list: Optional[List[str]] = Field(
+        default_factory=None,
+        description="The image urls used to generate the predicted answer.",
+    )
     predicted_answer: str = Field(
         default_factory=str,
         description="The predicted answer to the example.",
36 changes: 33 additions & 3 deletions src/pai_rag/evaluation/evaluator/base_evaluator.py
@@ -9,11 +9,15 @@
     EvaluationSample,
     PaiRagEvalDataset,
 )
+from llama_index.core.llama_dataset import (
+    CreatedBy,
+    CreatedByType,
+)
 from pai_rag.evaluation.dataset.rag_qca_dataset import PaiRagQcaDataset


 class BaseEvaluator:
-    def __init__(self, llm, persist_path: str = None):
+    def __init__(self, llm, persist_path: str = None, enable_multi_modal: bool = False):
         self._llm = llm
         self.persist_path = persist_path
         self.hitrate = HitRate()
@@ -32,9 +36,13 @@ def __init__(self, llm, persist_path: str = None):
         self.evaluation_dataset_path = os.path.join(
             self.persist_path, "evaluation_dataset.json"
         )
+        self.created_by = CreatedBy(
+            type=CreatedByType.AI, model_name=self._llm.metadata.model_name
+        )
         self.qca_dataset_path = os.path.join(self.persist_path, "qca_dataset.json")
         self._show_progress = True
         self._workers = 2
+        self.enable_multi_modal = enable_multi_modal

     def load_qca_dataset(self) -> None:
         if os.path.exists(self.qca_dataset_path):
@@ -75,22 +83,44 @@ async def compute_retrieval_metrics(self, qca_sample):
         for metric in self.retrieval_evaluators:
             metric_score = metric.compute(reference_node_id, predicted_node_id)
             setattr(retrieval_eval_example, metric.metric_name, metric_score)
+        setattr(retrieval_eval_example, "evaluated_by", self.created_by)

         return retrieval_eval_example

     async def compute_response_metrics(self, qca_sample):
         response_eval_example = EvaluationSample(**vars(qca_sample))
         query = response_eval_example.query
-        response = response_eval_example.reference_answer
+        reference_answer = response_eval_example.reference_answer
+        response_answer = response_eval_example.predicted_answer
+        reference_image_url_list = response_eval_example.reference_image_url_list
         contexts = response_eval_example.predicted_contexts

         for metric in self.response_evaluators:
-            metric_result = await metric.aevaluate(query, response, contexts)
+            if self.enable_multi_modal:
+                metric_result = await metric.aevaluate_multimodal(
+                    query,
+                    reference_answer,
+                    contexts,
+                    reference_image_url_list,
+                    response_answer,
+                    sleep_time_in_seconds=0.5,
+                )
+            else:
+                metric_result = await metric.aevaluate(
+                    query,
+                    reference_answer,
+                    contexts,
+                    response_answer,
+                    sleep_time_in_seconds=0.5,
+                )

             setattr(
                 response_eval_example, f"{metric.metric_name}_score", metric_result[0]
             )
             setattr(
                 response_eval_example, f"{metric.metric_name}_reason", metric_result[1]
             )
+        setattr(response_eval_example, "evaluated_by", self.created_by)

         return response_eval_example
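To make the control flow concrete, a hypothetical caller; the llm object and the qca_sample variable are assumptions, while the enable_multi_modal flag and method names come from this commit:

    # Hypothetical usage sketch -- not part of the commit.
    evaluator = BaseEvaluator(
        llm=multimodal_llm,       # assumed: an LLM wrapper exposing .metadata.model_name
        persist_path="localdata/eval_exp_data/storage",
        enable_multi_modal=True,  # routes response metrics through aevaluate_multimodal
    )
    eval_example = await evaluator.compute_response_metrics(qca_sample)
    print(eval_example.evaluated_by)  # CreatedBy record for the judging model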