From 50ba2c70b7130452d1916f1812608edc6b161630 Mon Sep 17 00:00:00 2001
From: Ki-Seki
Date: Thu, 3 Oct 2024 23:15:40 +0800
Subject: [PATCH 1/2] chore: rename eval to eval_suite

---
 .env | 1 +
 .github/CONTRIBUTING.md | 2 +-
 .github/workflows/python-package.yml | 2 +-
 demo.ipynb | 6 +++---
 docs/add-bench-or-model.md | 4 ++--
 docs/architecture.md | 2 +-
 {eval => src/eval_suite}/__init__.py | 0
 {eval => src/eval_suite}/benchs/__init__.py | 0
 {eval => src/eval_suite}/benchs/base_dataset.py | 0
 {eval => src/eval_suite}/benchs/base_evaluator.py | 0
 {eval => src/eval_suite}/benchs/ceval/README.md | 0
 {eval => src/eval_suite}/benchs/ceval/dataset.py | 0
 {eval => src/eval_suite}/benchs/ceval/eval_ceval.py | 0
 {eval => src/eval_suite}/benchs/ceval/subject_mapping.json | 0
 {eval => src/eval_suite}/benchs/ceval/utils.py | 0
 {eval => src/eval_suite}/benchs/exampleqa/README.md | 0
 {eval => src/eval_suite}/benchs/exampleqa/dataset.py | 0
 .../eval_suite}/benchs/exampleqa/dataset_exampleqa.jsonl | 0
 {eval => src/eval_suite}/benchs/exampleqa/eval_exampleqa.py | 2 +-
 {eval => src/eval_suite}/benchs/halluqa/README.md | 0
 {eval => src/eval_suite}/benchs/halluqa/dataset.py | 0
 .../eval_suite}/benchs/halluqa/dataset_halluqa.json | 0
 .../eval_suite}/benchs/halluqa/dataset_halluqa_mc.json | 0
 {eval => src/eval_suite}/benchs/halluqa/eval_base.py | 0
 {eval => src/eval_suite}/benchs/halluqa/eval_halluqa_mc.py | 0
 {eval => src/eval_suite}/benchs/halueval/README.md | 0
 {eval => src/eval_suite}/benchs/halueval/dataset.py | 0
 .../benchs/halueval/dataset_halueval_dialogue.jsonl | 0
 .../eval_suite}/benchs/halueval/dataset_halueval_qa.jsonl | 0
 .../benchs/halueval/dataset_halueval_summarization.jsonl | 0
 {eval => src/eval_suite}/benchs/halueval/eval_base.py | 0
 .../eval_suite}/benchs/halueval/eval_halueval_dialog.py | 0
 .../eval_suite}/benchs/halueval/eval_halueval_qa.py | 0
 .../eval_suite}/benchs/halueval/eval_halueval_summa.py | 0
 {eval => src/eval_suite}/benchs/uhgeval/README.md | 0
 {eval => src/eval_suite}/benchs/uhgeval/dataset.py | 0
 .../benchs/uhgeval/dataset_uhgeval_concise.jsonl | 0
 .../eval_suite}/benchs/uhgeval/dataset_uhgeval_full.jsonl | 0
 {eval => src/eval_suite}/benchs/uhgeval/eval_base.py | 0
 .../eval_suite}/benchs/uhgeval/eval_disc_keyword.py | 0
 .../eval_suite}/benchs/uhgeval/eval_disc_sentence.py | 0
 {eval => src/eval_suite}/benchs/uhgeval/eval_gene.py | 0
 {eval => src/eval_suite}/benchs/uhgeval/eval_sele.py | 0
 {eval => src/eval_suite}/cli.py | 0
 {eval => src/eval_suite}/llms/__init__.py | 0
 {eval => src/eval_suite}/llms/base_llm.py | 0
 {eval => src/eval_suite}/llms/huggingface.py | 0
 {eval => src/eval_suite}/llms/openai_api.py | 0
 {eval => src/eval_suite}/logging.py | 0
 {eval => src/eval_suite}/metrics.py | 0
 {eval => src/eval_suite}/utils.py | 0
 tests/{test_benchs => benchs}/__init__.py | 0
 tests/{test_benchs => benchs}/test_base_dataset.py | 2 +-
 tests/{test_benchs => benchs}/test_base_evaluator.py | 4 ++--
 tests/{test_llms => llms}/__init__.py | 0
 tests/{test_llms => llms}/test_base_llm.py | 2 +-
 tests/{test_llms => llms}/test_huggingface.py | 2 +-
 tests/{test_llms => llms}/test_openai_api.py | 2 +-
 tests/test_metrics.py | 2 +-
 59 files changed, 17 insertions(+), 16 deletions(-)
 create mode 100644 .env
 rename {eval => src/eval_suite}/__init__.py (100%)
 rename {eval => src/eval_suite}/benchs/__init__.py (100%)
 rename {eval => src/eval_suite}/benchs/base_dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/base_evaluator.py (100%)
 rename {eval => src/eval_suite}/benchs/ceval/README.md (100%)
 rename {eval => src/eval_suite}/benchs/ceval/dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/ceval/eval_ceval.py (100%)
 rename {eval => src/eval_suite}/benchs/ceval/subject_mapping.json (100%)
 rename {eval => src/eval_suite}/benchs/ceval/utils.py (100%)
 rename {eval => src/eval_suite}/benchs/exampleqa/README.md (100%)
 rename {eval => src/eval_suite}/benchs/exampleqa/dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/exampleqa/dataset_exampleqa.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/exampleqa/eval_exampleqa.py (98%)
 rename {eval => src/eval_suite}/benchs/halluqa/README.md (100%)
 rename {eval => src/eval_suite}/benchs/halluqa/dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/halluqa/dataset_halluqa.json (100%)
 rename {eval => src/eval_suite}/benchs/halluqa/dataset_halluqa_mc.json (100%)
 rename {eval => src/eval_suite}/benchs/halluqa/eval_base.py (100%)
 rename {eval => src/eval_suite}/benchs/halluqa/eval_halluqa_mc.py (100%)
 rename {eval => src/eval_suite}/benchs/halueval/README.md (100%)
 rename {eval => src/eval_suite}/benchs/halueval/dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/halueval/dataset_halueval_dialogue.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/halueval/dataset_halueval_qa.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/halueval/dataset_halueval_summarization.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/halueval/eval_base.py (100%)
 rename {eval => src/eval_suite}/benchs/halueval/eval_halueval_dialog.py (100%)
 rename {eval => src/eval_suite}/benchs/halueval/eval_halueval_qa.py (100%)
 rename {eval => src/eval_suite}/benchs/halueval/eval_halueval_summa.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/README.md (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/dataset.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/dataset_uhgeval_concise.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/dataset_uhgeval_full.jsonl (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/eval_base.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/eval_disc_keyword.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/eval_disc_sentence.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/eval_gene.py (100%)
 rename {eval => src/eval_suite}/benchs/uhgeval/eval_sele.py (100%)
 rename {eval => src/eval_suite}/cli.py (100%)
 rename {eval => src/eval_suite}/llms/__init__.py (100%)
 rename {eval => src/eval_suite}/llms/base_llm.py (100%)
 rename {eval => src/eval_suite}/llms/huggingface.py (100%)
 rename {eval => src/eval_suite}/llms/openai_api.py (100%)
 rename {eval => src/eval_suite}/logging.py (100%)
 rename {eval => src/eval_suite}/metrics.py (100%)
 rename {eval => src/eval_suite}/utils.py (100%)
 rename tests/{test_benchs => benchs}/__init__.py (100%)
 rename tests/{test_benchs => benchs}/test_base_dataset.py (97%)
 rename tests/{test_benchs => benchs}/test_base_evaluator.py (97%)
 rename tests/{test_llms => llms}/__init__.py (100%)
 rename tests/{test_llms => llms}/test_base_llm.py (95%)
 rename tests/{test_llms => llms}/test_huggingface.py (96%)
 rename tests/{test_llms => llms}/test_openai_api.py (92%)
diff --git a/.env b/.env
new file mode 100644
index 0000000..d28a0ee
--- /dev/null
+++ b/.env
@@ -0,0 +1 @@
+PYTHONPATH=src
\ No newline at end of file
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index bba9730..8728866 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -6,7 +6,7 @@ We appreciate your interest in contributing. To ensure a smooth collaboration, p
 > Please ensure that your code passes all tests and `black` code formatting before opening a pull request.
 > You can run the following commands to check your code:
 > ```bash
-> python -m unittest discover -s tests/ -p 'test*.py' -v
+> PYTHONPATH=src python -m unittest discover -s tests/ -p 'test*.py' -v
 > black . --check
 > ```
 
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index db51653..d2352ca 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -31,7 +31,7 @@ jobs:
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Test with unittest
       run: |
-        python -m unittest discover -s tests/ -p 'test*.py' -v
+        PYTHONPATH=src python -m unittest discover -s tests/ -p 'test*.py' -v
     - name: Test linting with black
       run: |
         black . --check
diff --git a/demo.ipynb b/demo.ipynb
index d9f18d7..864488c 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -15,9 +15,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from eval.benchs import ExampleQAEvaluator, get_all_evaluator_classes, load_evaluator\n",
-    "from eval.llms import HuggingFace, OpenAIAPI\n",
-    "from eval.utils import save_stats"
+    "from eval_suite.benchs import ExampleQAEvaluator, get_all_evaluator_classes, load_evaluator\n",
+    "from eval_suite.llms import HuggingFace, OpenAIAPI\n",
+    "from eval_suite.utils import save_stats"
    ]
   },
   {
diff --git a/docs/add-bench-or-model.md b/docs/add-bench-or-model.md
index 351b096..51eb76a 100644
--- a/docs/add-bench-or-model.md
+++ b/docs/add-bench-or-model.md
@@ -2,7 +2,7 @@
 
 ## Adding a New Benchmark
 
-You can refer to the structure of the `eval/benchs/exampleqa` folder, which serves as a minimal benchmark example. Additionally, you might want to check the `eval/benchs/base_dataset.py` and `eval/benchs/base_evaluator.py` files, as they provide the base classes for benchmarks.
+You can refer to the structure of the `src/eval_suite/benchs/exampleqa` folder, which serves as a minimal benchmark example. Additionally, you might want to check the `src/eval_suite/benchs/base_dataset.py` and `src/eval_suite/benchs/base_evaluator.py` files, as they provide the base classes for benchmarks.
 
 1. **Creating a Benchmark Folder**
    - Create a new folder under the `benchs` directory.
@@ -33,7 +33,7 @@ You can refer to the structure of the `eval/benchs/exampleqa` folder, which serv
 
 ## Adding a New Model Loader
 
-You can refer to the `eval/llms/huggingface.py` and `eval/llms/openai_api.py` files as examples for loading LLMs.
+You can refer to the `src/eval_suite/llms/huggingface.py` and `src/eval_suite/llms/openai_api.py` files as examples for loading LLMs.
 
 1. **Language Model Loader**
    - Create a new file under the `llms` directory.
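The common thread in the CONTRIBUTING.md and python-package.yml hunks above is the new `PYTHONPATH=src` prefix: once the package moves under `src/`, plain unittest discovery can no longer import it, and the `.env` file added by this patch carries the same setting for editors that honor it. A quick check from the repository root, as a sketch assuming the post-rename layout:

```bash
# Without the prefix, discovery cannot import the relocated package
# (e.g. `from eval_suite.metrics import ...` raises ModuleNotFoundError).
python -m unittest discover -s tests/ -p 'test*.py' -v

# With src/ on the path the tests resolve, matching CI and CONTRIBUTING.md.
PYTHONPATH=src python -m unittest discover -s tests/ -p 'test*.py' -v
black . --check
```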
diff --git a/docs/architecture.md b/docs/architecture.md
index d4f1ac7..c39e0dc 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -9,7 +9,7 @@ A base evaluator and dataset under `benchs` provide default evaluation logic and
 ## Structure
 
 ```bash
-eval
+src/eval_suite/
 ├── __init__.py
 ├── cli.py      # Command line interface
 ├── logging.py  # Global logging configuration
diff --git a/eval/__init__.py b/src/eval_suite/__init__.py
similarity index 100%
rename from eval/__init__.py
rename to src/eval_suite/__init__.py
diff --git a/eval/benchs/__init__.py b/src/eval_suite/benchs/__init__.py
similarity index 100%
rename from eval/benchs/__init__.py
rename to src/eval_suite/benchs/__init__.py
diff --git a/eval/benchs/base_dataset.py b/src/eval_suite/benchs/base_dataset.py
similarity index 100%
rename from eval/benchs/base_dataset.py
rename to src/eval_suite/benchs/base_dataset.py
diff --git a/eval/benchs/base_evaluator.py b/src/eval_suite/benchs/base_evaluator.py
similarity index 100%
rename from eval/benchs/base_evaluator.py
rename to src/eval_suite/benchs/base_evaluator.py
diff --git a/eval/benchs/ceval/README.md b/src/eval_suite/benchs/ceval/README.md
similarity index 100%
rename from eval/benchs/ceval/README.md
rename to src/eval_suite/benchs/ceval/README.md
diff --git a/eval/benchs/ceval/dataset.py b/src/eval_suite/benchs/ceval/dataset.py
similarity index 100%
rename from eval/benchs/ceval/dataset.py
rename to src/eval_suite/benchs/ceval/dataset.py
diff --git a/eval/benchs/ceval/eval_ceval.py b/src/eval_suite/benchs/ceval/eval_ceval.py
similarity index 100%
rename from eval/benchs/ceval/eval_ceval.py
rename to src/eval_suite/benchs/ceval/eval_ceval.py
diff --git a/eval/benchs/ceval/subject_mapping.json b/src/eval_suite/benchs/ceval/subject_mapping.json
similarity index 100%
rename from eval/benchs/ceval/subject_mapping.json
rename to src/eval_suite/benchs/ceval/subject_mapping.json
diff --git a/eval/benchs/ceval/utils.py b/src/eval_suite/benchs/ceval/utils.py
similarity index 100%
rename from eval/benchs/ceval/utils.py
rename to src/eval_suite/benchs/ceval/utils.py
diff --git a/eval/benchs/exampleqa/README.md b/src/eval_suite/benchs/exampleqa/README.md
similarity index 100%
rename from eval/benchs/exampleqa/README.md
rename to src/eval_suite/benchs/exampleqa/README.md
diff --git a/eval/benchs/exampleqa/dataset.py b/src/eval_suite/benchs/exampleqa/dataset.py
similarity index 100%
rename from eval/benchs/exampleqa/dataset.py
rename to src/eval_suite/benchs/exampleqa/dataset.py
diff --git a/eval/benchs/exampleqa/dataset_exampleqa.jsonl b/src/eval_suite/benchs/exampleqa/dataset_exampleqa.jsonl
similarity index 100%
rename from eval/benchs/exampleqa/dataset_exampleqa.jsonl
rename to src/eval_suite/benchs/exampleqa/dataset_exampleqa.jsonl
diff --git a/eval/benchs/exampleqa/eval_exampleqa.py b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
similarity index 98%
rename from eval/benchs/exampleqa/eval_exampleqa.py
rename to src/eval_suite/benchs/exampleqa/eval_exampleqa.py
index 582cca5..d9410a9 100644
--- a/eval/benchs/exampleqa/eval_exampleqa.py
+++ b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
@@ -1,6 +1,6 @@
 import os
 
-from eval.llms.base_llm import BaseLLM
+from ...llms.base_llm import BaseLLM
 
 from ..base_evaluator import BaseEvaluator
 from .dataset import ExampleQADataset
diff --git a/eval/benchs/halluqa/README.md b/src/eval_suite/benchs/halluqa/README.md
similarity index 100%
rename from eval/benchs/halluqa/README.md
rename to src/eval_suite/benchs/halluqa/README.md
diff --git a/eval/benchs/halluqa/dataset.py b/src/eval_suite/benchs/halluqa/dataset.py
similarity index 100%
rename from eval/benchs/halluqa/dataset.py
rename to src/eval_suite/benchs/halluqa/dataset.py
diff --git a/eval/benchs/halluqa/dataset_halluqa.json b/src/eval_suite/benchs/halluqa/dataset_halluqa.json
similarity index 100%
rename from eval/benchs/halluqa/dataset_halluqa.json
rename to src/eval_suite/benchs/halluqa/dataset_halluqa.json
diff --git a/eval/benchs/halluqa/dataset_halluqa_mc.json b/src/eval_suite/benchs/halluqa/dataset_halluqa_mc.json
similarity index 100%
rename from eval/benchs/halluqa/dataset_halluqa_mc.json
rename to src/eval_suite/benchs/halluqa/dataset_halluqa_mc.json
diff --git a/eval/benchs/halluqa/eval_base.py b/src/eval_suite/benchs/halluqa/eval_base.py
similarity index 100%
rename from eval/benchs/halluqa/eval_base.py
rename to src/eval_suite/benchs/halluqa/eval_base.py
diff --git a/eval/benchs/halluqa/eval_halluqa_mc.py b/src/eval_suite/benchs/halluqa/eval_halluqa_mc.py
similarity index 100%
rename from eval/benchs/halluqa/eval_halluqa_mc.py
rename to src/eval_suite/benchs/halluqa/eval_halluqa_mc.py
diff --git a/eval/benchs/halueval/README.md b/src/eval_suite/benchs/halueval/README.md
similarity index 100%
rename from eval/benchs/halueval/README.md
rename to src/eval_suite/benchs/halueval/README.md
diff --git a/eval/benchs/halueval/dataset.py b/src/eval_suite/benchs/halueval/dataset.py
similarity index 100%
rename from eval/benchs/halueval/dataset.py
rename to src/eval_suite/benchs/halueval/dataset.py
diff --git a/eval/benchs/halueval/dataset_halueval_dialogue.jsonl b/src/eval_suite/benchs/halueval/dataset_halueval_dialogue.jsonl
similarity index 100%
rename from eval/benchs/halueval/dataset_halueval_dialogue.jsonl
rename to src/eval_suite/benchs/halueval/dataset_halueval_dialogue.jsonl
diff --git a/eval/benchs/halueval/dataset_halueval_qa.jsonl b/src/eval_suite/benchs/halueval/dataset_halueval_qa.jsonl
similarity index 100%
rename from eval/benchs/halueval/dataset_halueval_qa.jsonl
rename to src/eval_suite/benchs/halueval/dataset_halueval_qa.jsonl
diff --git a/eval/benchs/halueval/dataset_halueval_summarization.jsonl b/src/eval_suite/benchs/halueval/dataset_halueval_summarization.jsonl
similarity index 100%
rename from eval/benchs/halueval/dataset_halueval_summarization.jsonl
rename to src/eval_suite/benchs/halueval/dataset_halueval_summarization.jsonl
diff --git a/eval/benchs/halueval/eval_base.py b/src/eval_suite/benchs/halueval/eval_base.py
similarity index 100%
rename from eval/benchs/halueval/eval_base.py
rename to src/eval_suite/benchs/halueval/eval_base.py
diff --git a/eval/benchs/halueval/eval_halueval_dialog.py b/src/eval_suite/benchs/halueval/eval_halueval_dialog.py
similarity index 100%
rename from eval/benchs/halueval/eval_halueval_dialog.py
rename to src/eval_suite/benchs/halueval/eval_halueval_dialog.py
diff --git a/eval/benchs/halueval/eval_halueval_qa.py b/src/eval_suite/benchs/halueval/eval_halueval_qa.py
similarity index 100%
rename from eval/benchs/halueval/eval_halueval_qa.py
rename to src/eval_suite/benchs/halueval/eval_halueval_qa.py
diff --git a/eval/benchs/halueval/eval_halueval_summa.py b/src/eval_suite/benchs/halueval/eval_halueval_summa.py
similarity index 100%
rename from eval/benchs/halueval/eval_halueval_summa.py
rename to src/eval_suite/benchs/halueval/eval_halueval_summa.py
diff --git a/eval/benchs/uhgeval/README.md b/src/eval_suite/benchs/uhgeval/README.md
similarity index 100%
rename from eval/benchs/uhgeval/README.md
rename to src/eval_suite/benchs/uhgeval/README.md
diff --git a/eval/benchs/uhgeval/dataset.py b/src/eval_suite/benchs/uhgeval/dataset.py
similarity index 100%
rename from eval/benchs/uhgeval/dataset.py
rename to src/eval_suite/benchs/uhgeval/dataset.py
diff --git a/eval/benchs/uhgeval/dataset_uhgeval_concise.jsonl b/src/eval_suite/benchs/uhgeval/dataset_uhgeval_concise.jsonl
similarity index 100%
rename from eval/benchs/uhgeval/dataset_uhgeval_concise.jsonl
rename to src/eval_suite/benchs/uhgeval/dataset_uhgeval_concise.jsonl
diff --git a/eval/benchs/uhgeval/dataset_uhgeval_full.jsonl b/src/eval_suite/benchs/uhgeval/dataset_uhgeval_full.jsonl
similarity index 100%
rename from eval/benchs/uhgeval/dataset_uhgeval_full.jsonl
rename to src/eval_suite/benchs/uhgeval/dataset_uhgeval_full.jsonl
diff --git a/eval/benchs/uhgeval/eval_base.py b/src/eval_suite/benchs/uhgeval/eval_base.py
similarity index 100%
rename from eval/benchs/uhgeval/eval_base.py
rename to src/eval_suite/benchs/uhgeval/eval_base.py
diff --git a/eval/benchs/uhgeval/eval_disc_keyword.py b/src/eval_suite/benchs/uhgeval/eval_disc_keyword.py
similarity index 100%
rename from eval/benchs/uhgeval/eval_disc_keyword.py
rename to src/eval_suite/benchs/uhgeval/eval_disc_keyword.py
diff --git a/eval/benchs/uhgeval/eval_disc_sentence.py b/src/eval_suite/benchs/uhgeval/eval_disc_sentence.py
similarity index 100%
rename from eval/benchs/uhgeval/eval_disc_sentence.py
rename to src/eval_suite/benchs/uhgeval/eval_disc_sentence.py
diff --git a/eval/benchs/uhgeval/eval_gene.py b/src/eval_suite/benchs/uhgeval/eval_gene.py
similarity index 100%
rename from eval/benchs/uhgeval/eval_gene.py
rename to src/eval_suite/benchs/uhgeval/eval_gene.py
diff --git a/eval/benchs/uhgeval/eval_sele.py b/src/eval_suite/benchs/uhgeval/eval_sele.py
similarity index 100%
rename from eval/benchs/uhgeval/eval_sele.py
rename to src/eval_suite/benchs/uhgeval/eval_sele.py
diff --git a/eval/cli.py b/src/eval_suite/cli.py
similarity index 100%
rename from eval/cli.py
rename to src/eval_suite/cli.py
diff --git a/eval/llms/__init__.py b/src/eval_suite/llms/__init__.py
similarity index 100%
rename from eval/llms/__init__.py
rename to src/eval_suite/llms/__init__.py
diff --git a/eval/llms/base_llm.py b/src/eval_suite/llms/base_llm.py
similarity index 100%
rename from eval/llms/base_llm.py
rename to src/eval_suite/llms/base_llm.py
diff --git a/eval/llms/huggingface.py b/src/eval_suite/llms/huggingface.py
similarity index 100%
rename from eval/llms/huggingface.py
rename to src/eval_suite/llms/huggingface.py
diff --git a/eval/llms/openai_api.py b/src/eval_suite/llms/openai_api.py
similarity index 100%
rename from eval/llms/openai_api.py
rename to src/eval_suite/llms/openai_api.py
diff --git a/eval/logging.py b/src/eval_suite/logging.py
similarity index 100%
rename from eval/logging.py
rename to src/eval_suite/logging.py
diff --git a/eval/metrics.py b/src/eval_suite/metrics.py
similarity index 100%
rename from eval/metrics.py
rename to src/eval_suite/metrics.py
diff --git a/eval/utils.py b/src/eval_suite/utils.py
similarity index 100%
rename from eval/utils.py
rename to src/eval_suite/utils.py
diff --git a/tests/test_benchs/__init__.py b/tests/benchs/__init__.py
similarity index 100%
rename from tests/test_benchs/__init__.py
rename to tests/benchs/__init__.py
diff --git a/tests/test_benchs/test_base_dataset.py b/tests/benchs/test_base_dataset.py
similarity index 97%
rename from tests/test_benchs/test_base_dataset.py
rename to tests/benchs/test_base_dataset.py
index 535b0ca..a16afed 100644
--- a/tests/test_benchs/test_base_dataset.py
+++ b/tests/benchs/test_base_dataset.py
@@ -1,6 +1,6 @@
 import unittest
 
-from eval.benchs.base_dataset import DummyDataset
+from eval_suite.benchs.base_dataset import DummyDataset
 
 
 class TestDummyDataset(unittest.TestCase):
diff --git a/tests/test_benchs/test_base_evaluator.py b/tests/benchs/test_base_evaluator.py
similarity index 97%
rename from tests/test_benchs/test_base_evaluator.py
rename to tests/benchs/test_base_evaluator.py
index 6f69ba0..8ef35d1 100644
--- a/tests/test_benchs/test_base_evaluator.py
+++ b/tests/benchs/test_base_evaluator.py
@@ -2,8 +2,8 @@
 import unittest
 from unittest.mock import MagicMock
 
-from eval.benchs.base_evaluator import DummyEvaluator
-from eval.llms.base_llm import BaseLLM
+from eval_suite.benchs.base_evaluator import DummyEvaluator
+from eval_suite.llms.base_llm import BaseLLM
 
 
 class TestDummyEvaluator(unittest.TestCase):
diff --git a/tests/test_llms/__init__.py b/tests/llms/__init__.py
similarity index 100%
rename from tests/test_llms/__init__.py
rename to tests/llms/__init__.py
diff --git a/tests/test_llms/test_base_llm.py b/tests/llms/test_base_llm.py
similarity index 95%
rename from tests/test_llms/test_base_llm.py
rename to tests/llms/test_base_llm.py
index 7731c04..e068d84 100644
--- a/tests/test_llms/test_base_llm.py
+++ b/tests/llms/test_base_llm.py
@@ -1,7 +1,7 @@
 import unittest
 from unittest.mock import MagicMock
 
-from eval.llms.base_llm import BaseLLM
+from eval_suite.llms.base_llm import BaseLLM
 
 
 class TestBaseLLM(unittest.TestCase):
diff --git a/tests/test_llms/test_huggingface.py b/tests/llms/test_huggingface.py
similarity index 96%
rename from tests/test_llms/test_huggingface.py
rename to tests/llms/test_huggingface.py
index 3b9856b..7142624 100644
--- a/tests/test_llms/test_huggingface.py
+++ b/tests/llms/test_huggingface.py
@@ -2,7 +2,7 @@
 
 import torch
 
-from eval.llms.huggingface import HuggingFace
+from eval_suite.llms.huggingface import HuggingFace
 
 
 class TestHuggingFace(unittest.TestCase):
diff --git a/tests/test_llms/test_openai_api.py b/tests/llms/test_openai_api.py
similarity index 92%
rename from tests/test_llms/test_openai_api.py
rename to tests/llms/test_openai_api.py
index c829070..7f103e2 100644
--- a/tests/test_llms/test_openai_api.py
+++ b/tests/llms/test_openai_api.py
@@ -1,6 +1,6 @@
 import unittest
 
-from eval.llms.openai_api import OpenAIAPI
+from eval_suite.llms.openai_api import OpenAIAPI
 
 
 class TestOpenAIAPI(unittest.TestCase):
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index d48cb2e..417b968 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -1,6 +1,6 @@
 import unittest
 
-from eval.metrics import bert_score, bleu_4, keyword_precision, rouge_l
+from eval_suite.metrics import bert_score, bleu_4, keyword_precision, rouge_l
 
 
 class TestEvaluationFunctions(unittest.TestCase):

From 0568d2483c37d53353c425be0c9c6f96ae25615f Mon Sep 17 00:00:00 2001
From: Ki-Seki
Date: Fri, 4 Oct 2024 02:15:04 +0800
Subject: [PATCH 2/2] build: packaging to PyPI

---
 .gitignore | 1 +
 README.md | 40 ++++++------
 demo.ipynb | 6 +-
 docs/experiments/20240822/expt.py | 4 +-
 pyproject.toml | 58 +++++++++++++++++++
 requirements.txt | 23 --------
 .../benchs/exampleqa/eval_exampleqa.py | 1 -
 src/eval_suite/cli.py | 6 +-
 8 files changed, 88 insertions(+), 51 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100644 requirements.txt
diff --git a/.gitignore b/.gitignore
index 97c5698..3cdc8ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 /.vscode/
 /output/
+/dist/
 __pycache__/
diff --git a/README.md b/README.md
index d581390..84dd794 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@

What does this repository include?
- UHGEval: An unconstrained hallucination evaluation benchmark.
- Eval Suite: A user-friendly evaluation framework for hallucination tasks.
+ UHGEval: An unconstrained hallucination evaluation benchmark.
+ Eval Suite: A user-friendly evaluation framework for hallucination tasks.
Eval Suite supports other benchmarks, such as HalluQA and HaluEval.

@@ -31,36 +31,32 @@
 ## Quick Start
 
 ```bash
-# Clone the repository
-git clone https://github.com/IAAR-Shanghai/UHGEval.git
-cd UHGEval
-
-# Install dependencies
+# Install Eval Suite
 conda create -n uhg python=3.10
 conda activate uhg
-pip install -r requirements.txt
+pip install eval-suite
 
 # Run evaluations with OpenAI Compatible API
-python -m eval.cli eval openai \
+eval_suite eval openai \
     --model_name gpt-4o \
     --api_key your_api_key \
     --base_url https://api.openai.com/v1 \
     --evaluators ExampleQAEvaluator UHGSelectiveEvaluator
 
 # Or run evaluations with Hugging Face Transformers
-python -m eval.cli eval huggingface \
+eval_suite eval huggingface \
     --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
     --apply_chat_template \
     --evaluators ExampleQAEvaluator UHGSelectiveEvaluator
 
 # After evaluation, you can gather statistics of the evaluation results
-python -m eval.cli stat
+eval_suite stat
 
 # List all available evaluators
-python -m eval.cli list
+eval_suite list
 
 # Get help
-python -m eval.cli --help
+eval_suite --help
 ```
 
 > [!Tip]
@@ -113,13 +109,13 @@ UHGEval is a large-scale benchmark designed for evaluating hallucination in prof
 
 To facilitate evaluation, we have developed a user-friendly evaluation framework called Eval Suite. Currently, Eval Suite supports common hallucination evaluation benchmarks, allowing for comprehensive evaluation of the same LLM with just one command as shown in the [Quick Start](#quick-start) section.
 
-| Benchmark | Evaluator | More Information |
-| --------- | --------- | ---------------- |
-| C-Eval | `CEvalEvaluator` | [eval/benchs/ceval](eval/benchs/ceval) |
-| ExampleQA | `ExampleQAEvaluator` | [eval/benchs/exampleqa](eval/benchs/exampleqa) |
-| HalluQA | `HalluQAMCEvaluator` | [eval/benchs/halluqa](eval/benchs/halluqa) |
-| HaluEval | `HaluEvalDialogEvaluator`<br>`HaluEvalQAEvaluator`<br>`HaluEvalSummaEvaluator` | [eval/benchs/halueval](eval/benchs/halueval) |
-| UHGEval | `UHGDiscKeywordEvaluator`<br>`UHGDiscSentenceEvaluator`<br>`UHGGenerativeEvaluator`<br>`UHGSelectiveEvaluator` | [eval/benchs/uhgeval](eval/benchs/uhgeval) |
+| Benchmark | Evaluator | More Information |
+| --------- | --------- | ---------------- |
+| C-Eval | `CEvalEvaluator` | [src/eval_suite/benchs/ceval](src/eval_suite/benchs/ceval) |
+| ExampleQA | `ExampleQAEvaluator` | [src/eval_suite/benchs/exampleqa](src/eval_suite/benchs/exampleqa) |
+| HalluQA | `HalluQAMCEvaluator` | [src/eval_suite/benchs/halluqa](src/eval_suite/benchs/halluqa) |
+| HaluEval | `HaluEvalDialogEvaluator`<br>`HaluEvalQAEvaluator`<br>`HaluEvalSummaEvaluator` | [src/eval_suite/benchs/halueval](src/eval_suite/benchs/halueval) |
+| UHGEval | `UHGDiscKeywordEvaluator`<br>`UHGDiscSentenceEvaluator`<br>`UHGGenerativeEvaluator`<br>`UHGSelectiveEvaluator` | [src/eval_suite/benchs/uhgeval](src/eval_suite/benchs/uhgeval) |
 
 ## Learn More
 
@@ -162,8 +158,6 @@
   Click me to show all TODOs
 
 - [ ] feat: vLLM offline inference benchmarking
-- [ ] build: packaging
 - [ ] feat(benchs): add TruthfulQA benchmark
-- [ ] other: promotion
-
+- [ ] ci: auto release to PyPI
 
diff --git a/demo.ipynb b/demo.ipynb
index 864488c..588e7d8 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -15,7 +15,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from eval_suite.benchs import ExampleQAEvaluator, get_all_evaluator_classes, load_evaluator\n",
+    "from eval_suite.benchs import (\n",
+    "    ExampleQAEvaluator,\n",
+    "    get_all_evaluator_classes,\n",
+    "    load_evaluator,\n",
+    ")\n",
     "from eval_suite.llms import HuggingFace, OpenAIAPI\n",
     "from eval_suite.utils import save_stats"
    ]
diff --git a/docs/experiments/20240822/expt.py b/docs/experiments/20240822/expt.py
index a4dc58e..f0b9147 100644
--- a/docs/experiments/20240822/expt.py
+++ b/docs/experiments/20240822/expt.py
@@ -1,10 +1,10 @@
-from eval.benchs import (
+from eval_suite.benchs import (
     UHGDiscKeywordEvaluator,
     UHGDiscSentenceEvaluator,
     UHGGenerativeEvaluator,
     UHGSelectiveEvaluator,
 )
-from eval.llms import OpenAIAPI
+from eval_suite.llms import OpenAIAPI
 
 glm = OpenAIAPI(
     model_name="THUDM/glm-4-9b-chat",
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..dfb1f85
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,58 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "eval_suite"
+dependencies = [
+    # Common
+    "torch",
+    "tqdm",
+    "ipykernel",
+
+    # OpenAI API
+    "openai",
+    "tenacity",
+
+    # Hugging Face Transformers
+    "transformers",
+    "accelerate",
+    "sentencepiece",
+
+    # Metrics
+    "nltk",
+    "rouge_score",
+    "text2vec",
+    "absl-py",
+
+    # Formatting
+    "black",
+    "isort",
+]
+authors = [{ name = "Shichao Song", email = "songshichao517@gmail.com" }]
+description = "User-friendly evaluation framework: Eval Suite & Benchmarks: UHGEval, HaluEval, HalluQA, etc."
+license = { file = "LICENSE" }
+keywords = [
+    "UHGEval",
+    "Chinese",
+    "hallucination",
+    "evaluation",
+    "llm",
+    "eval_suite",
+]
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+dynamic = ["readme", "version"]
+
+[project.urls]
+Repository = "https://github.com/IAAR-Shanghai/UHGEval"
+
+[project.scripts]
+eval_suite = "eval_suite.cli:main"
+
+[tool.hatch.version]
+source = "vcs"
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 7788a99..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Common
-torch
-tqdm
-ipykernel
-
-# OpenAI API
-openai
-tenacity
-
-# Hugging Face Transformers
-transformers
-accelerate
-sentencepiece
-
-# Metrics
-nltk
-rouge_score
-text2vec
-absl-py
-
-# Formatting
-black
-isort
diff --git a/src/eval_suite/benchs/exampleqa/eval_exampleqa.py b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
index d9410a9..0912f0b 100644
--- a/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
+++ b/src/eval_suite/benchs/exampleqa/eval_exampleqa.py
@@ -1,7 +1,6 @@
 import os
 
 from ...llms.base_llm import BaseLLM
-
 from ..base_evaluator import BaseEvaluator
 from .dataset import ExampleQADataset
 
diff --git a/src/eval_suite/cli.py b/src/eval_suite/cli.py
index e80782e..50c5c26 100644
--- a/src/eval_suite/cli.py
+++ b/src/eval_suite/cli.py
@@ -51,7 +51,7 @@ def parse_args():
 # fmt: on
 
 
-if __name__ == "__main__":
+def main():
     args = parse_args()
     logger.info(f"Start the CLI with args: {args}")
 
@@ -80,3 +80,7 @@ def parse_args():
     elif args.operation_name == "list":
         print("All evaluators:")
         pprint(all_evaluators)
+
+
+if __name__ == "__main__":
+    main()
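Together, the two patches leave the project pip-installable with a real console entry point: `[project.scripts]` maps the `eval_suite` command to the new `main()` in `src/eval_suite/cli.py`. A minimal local sanity check might look like the following sketch; `pip install -e .` is standard packaging practice rather than a command taken from the patch, while the remaining commands appear verbatim above:

```bash
# Editable install driven by the new pyproject.toml; hatch-vcs derives the
# package version from git tags, so run this inside the cloned repository.
pip install -e .

# The [project.scripts] table maps this command to eval_suite.cli:main.
eval_suite --help
eval_suite list

# Unit tests still import from the src/ layout via PYTHONPATH, matching CI.
PYTHONPATH=src python -m unittest discover -s tests/ -p 'test*.py' -v
```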