diff --git a/.gitignore b/.gitignore
index 97c5698..3cdc8ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
/.vscode/
/output/
+/dist/
__pycache__/
diff --git a/README.md b/README.md
index d581390..84dd794 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@
What does this repository include?
- UHGEval: An unconstrained hallucination evaluation benchmark.
- Eval Suite: A user-friendly evaluation framework for hallucination tasks.
+ UHGEval: An unconstrained hallucination evaluation benchmark.
+ Eval Suite: A user-friendly evaluation framework for hallucination tasks.
Eval Suite supports other benchmarks, such as HalluQA and HaluEval.
@@ -31,36 +31,32 @@
## Quick Start
```bash
-# Clone the repository
-git clone https://github.com/IAAR-Shanghai/UHGEval.git
-cd UHGEval
-
-# Install dependencies
+# Install Eval Suite
conda create -n uhg python=3.10
conda activate uhg
-pip install -r requirements.txt
+pip install eval-suite
# Run evaluations with OpenAI Compatible API
-python -m eval.cli eval openai \
+eval_suite eval openai \
--model_name gpt-4o \
--api_key your_api_key \
--base_url https://api.openai.com/v1 \
--evaluators ExampleQAEvaluator UHGSelectiveEvaluator
# Or run evaluations with Hugging Face Transformers
-python -m eval.cli eval huggingface \
+eval_suite eval huggingface \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--apply_chat_template \
--evaluators ExampleQAEvaluator UHGSelectiveEvaluator
# After evaluation, you can gather statistics of the evaluation results
-python -m eval.cli stat
+eval_suite stat
# List all available evaluators
-python -m eval.cli list
+eval_suite list
# Get help
-python -m eval.cli --help
+eval_suite --help
```
> [!TIP]
@@ -113,13 +109,13 @@ UHGEval is a large-scale benchmark designed for evaluating hallucination in prof
To facilitate evaluation, we have developed a user-friendly evaluation framework called Eval Suite. Currently, Eval Suite supports common hallucination evaluation benchmarks, allowing for comprehensive evaluation of the same LLM with just one command as shown in the [Quick Start](#quick-start) section.
-| Benchmark | Evaluator | More Information |
-| --------- | -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- |
-| C-Eval | `CEvalEvaluator` | [eval/benchs/ceval](eval/benchs/ceval) |
-| ExampleQA | `ExampleQAEvaluator` | [eval/benchs/exampleqa](eval/benchs/exampleqa) |
-| HalluQA | `HalluQAMCEvaluator` | [eval/benchs/halluqa](eval/benchs/halluqa) |
-| HaluEval  | `HaluEvalDialogEvaluator`<br>`HaluEvalQAEvaluator`<br>`HaluEvalSummaEvaluator`                                  | [eval/benchs/halueval](eval/benchs/halueval)   |
-| UHGEval   | `UHGDiscKeywordEvaluator`<br>`UHGDiscSentenceEvaluator`<br>`UHGGenerativeEvaluator`<br>`UHGSelectiveEvaluator` | [eval/benchs/uhgeval](eval/benchs/uhgeval)     |
+| Benchmark | Evaluator | More Information |
+| --------- | -------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ |
+| C-Eval | `CEvalEvaluator` | [src/eval_suite/benchs/ceval](src/eval_suite/benchs/ceval) |
+| ExampleQA | `ExampleQAEvaluator` | [src/eval_suite/benchs/exampleqa](src/eval_suite/benchs/exampleqa) |
+| HalluQA | `HalluQAMCEvaluator` | [src/eval_suite/benchs/halluqa](src/eval_suite/benchs/halluqa) |
+| HaluEval  | `HaluEvalDialogEvaluator`<br>`HaluEvalQAEvaluator`<br>`HaluEvalSummaEvaluator`                                  | [src/eval_suite/benchs/halueval](src/eval_suite/benchs/halueval)     |
+| UHGEval   | `UHGDiscKeywordEvaluator`<br>`UHGDiscSentenceEvaluator`<br>`UHGGenerativeEvaluator`<br>`UHGSelectiveEvaluator` | [src/eval_suite/benchs/uhgeval](src/eval_suite/benchs/uhgeval)       |
## Learn More
@@ -162,8 +158,6 @@ To facilitate evaluation, we have developed a user-friendly evaluation framework
Click me to show all TODOs
- [ ] feat: vLLM offline inference benchmarking
-- [ ] build: packaging
- [ ] feat(benchs): add TruthfulQA benchmark
-- [ ] other: promotion
-
+- [ ] ci: auto release to PyPI
diff --git a/demo.ipynb b/demo.ipynb
index 864488c..588e7d8 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -15,7 +15,11 @@
"metadata": {},
"outputs": [],
"source": [
- "from eval_suite.benchs import ExampleQAEvaluator, get_all_evaluator_classes, load_evaluator\n",
+ "from eval_suite.benchs import (\n",
+ " ExampleQAEvaluator,\n",
+ " get_all_evaluator_classes,\n",
+ " load_evaluator,\n",
+ ")\n",
"from eval_suite.llms import HuggingFace, OpenAIAPI\n",
"from eval_suite.utils import save_stats"
]
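
Beyond the notebook, the same imports compose programmatically. A minimal sketch, assuming `load_evaluator` looks evaluators up by class name (the `OpenAIAPI` arguments mirror the Quick Start flags):

```python
# Hedged sketch, not part of the diff: the constructor arguments mirror the
# Quick Start CLI flags, and load_evaluator's exact signature is an assumption.
from eval_suite.benchs import get_all_evaluator_classes, load_evaluator
from eval_suite.llms import OpenAIAPI

llm = OpenAIAPI(
    model_name="gpt-4o",
    api_key="your_api_key",
    base_url="https://api.openai.com/v1",
)

# Enumerate everything the package ships, then resolve one evaluator by name.
print([cls.__name__ for cls in get_all_evaluator_classes()])
evaluator = load_evaluator("ExampleQAEvaluator")  # assumed: lookup by name
```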
diff --git a/docs/experiments/20240822/expt.py b/docs/experiments/20240822/expt.py
index a4dc58e..f0b9147 100644
--- a/docs/experiments/20240822/expt.py
+++ b/docs/experiments/20240822/expt.py
@@ -1,10 +1,10 @@
-from eval.benchs import (
+from eval_suite.benchs import (
UHGDiscKeywordEvaluator,
UHGDiscSentenceEvaluator,
UHGGenerativeEvaluator,
UHGSelectiveEvaluator,
)
-from eval.llms import OpenAIAPI
+from eval_suite.llms import OpenAIAPI
glm = OpenAIAPI(
model_name="THUDM/glm-4-9b-chat",
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..dfb1f85
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,58 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "eval_suite"
+dependencies = [
+ # Common
+ "torch",
+ "tqdm",
+ "ipykernel",
+
+ # OpenAI API
+ "openai",
+ "tenacity",
+
+ # Hugging Face Transformers
+ "transformers",
+ "accelerate",
+ "sentencepiece",
+
+ # Metrics
+ "nltk",
+ "rouge_score",
+ "text2vec",
+ "absl-py",
+
+ # Formatting
+ "black",
+ "isort",
+]
+authors = [{ name = "Shichao Song", email = "songshichao517@gmail.com" }]
+description = "Eval Suite: a user-friendly evaluation framework for hallucination benchmarks such as UHGEval, HaluEval, and HalluQA"
+license = { file = "LICENSE" }
+keywords = [
+ "UHGEval",
+ "Chinese",
+ "hallucination",
+ "evaluation",
+ "llm",
+ "eval_suite",
+]
+requires-python = ">=3.10"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+]
+dynamic = ["readme", "version"]
+
+[project.urls]
+Repository = "https://github.com/IAAR-Shanghai/UHGEval"
+
+[project.scripts]
+eval_suite = "eval_suite.cli:main"
+
+[tool.hatch.version]
+source = "vcs"
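
Once installed, the `[project.scripts]` entry above is what makes the bare `eval_suite` command work. A small sketch to confirm the resolution, using only the standard library:

```python
# Hedged sketch: inspect the console-script entry point that pip generates
# from [project.scripts] when the package is installed.
from importlib.metadata import entry_points, version

print(version("eval_suite"))  # version is dynamic, derived from git tags by hatch-vcs

for ep in entry_points(group="console_scripts"):
    if ep.name == "eval_suite":
        print(ep.value)   # "eval_suite.cli:main"
        main = ep.load()  # the same callable the eval_suite command invokes
```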
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 7788a99..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Common
-torch
-tqdm
-ipykernel
-
-# OpenAI API
-openai
-tenacity
-
-# Hugging Face Transformers
-transformers
-accelerate
-sentencepiece
-
-# Metrics
-nltk
-rouge_score
-text2vec
-absl-py
-
-# Formatting
-black
-isort
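
The dependency list deleted here moved verbatim into `[project.dependencies]` in pyproject.toml, so it still travels with the package metadata; a quick hedged check:

```python
# Hedged sketch: after installation, the same dependencies are exposed
# through the package metadata instead of a requirements.txt file.
from importlib.metadata import requires

for req in requires("eval_suite") or []:
    print(req)  # torch, tqdm, ipykernel, openai, tenacity, ...
```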
diff --git a/src/eval_suite/benchs/exampleqa/eval_exampleqa.py b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
index d9410a9..0912f0b 100644
--- a/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
+++ b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
@@ -1,7 +1,6 @@
import os
from ...llms.base_llm import BaseLLM
-
from ..base_evaluator import BaseEvaluator
from .dataset import ExampleQADataset
diff --git a/src/eval_suite/cli.py b/src/eval_suite/cli.py
index e80782e..50c5c26 100644
--- a/src/eval_suite/cli.py
+++ b/src/eval_suite/cli.py
@@ -51,7 +51,7 @@ def parse_args():
# fmt: on
-if __name__ == "__main__":
+def main():
args = parse_args()
logger.info(f"Start the CLI with args: {args}")
@@ -80,3 +80,7 @@ def parse_args():
elif args.operation_name == "list":
print("All evaluators:")
pprint(all_evaluators)
+
+
+if __name__ == "__main__":
+ main()
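
The change above is the standard pattern for exposing a CLI as a console script: the module body moves into `main()`, which both pip's generated `eval_suite` wrapper and `python -m eval_suite.cli` then call. An illustrative, self-contained sketch of that pattern (the parser below is a placeholder, not the real one):

```python
# Illustrative sketch of the entry-point pattern; the parser here is a
# placeholder, not eval_suite's real argument parser.
import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog="eval_suite")
    parser.add_argument("operation_name", choices=["eval", "stat", "list"])
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    print(f"dispatching: {args.operation_name}")


if __name__ == "__main__":  # `python -m eval_suite.cli` keeps working
    main()
```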