Setting trust_remote_code to True for HuggingFace datasets compatibility (EleutherAI#1487)

* setting trust_remote_code
* dataset list no notebooks
* respect trust remote code
* Address changes, move cli options and change datasets
* fix task for tests
* headqa
* remove kobest
* pin datasets and address comments
* clean up space
mozilla-ai · Mar 3, 2024 · 9516792 · 9516792
1 parent e5e35fc
commit 9516792
Show file tree

Hide file tree

Showing 31 changed files with 75 additions and 1 deletion.
diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py
@@ -201,6 +201,12 @@ def parse_eval_args() -> argparse.Namespace:
             "E.g, `--seed 42` sets all three seeds to 42."
         ),
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        default=True,
+        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
+    )
+
     return parser.parse_args()
 
 
@@ -290,6 +296,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             path.mkdir(parents=True, exist_ok=True)
             output_path_file = path.joinpath("results.json")
 
+    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
+    if args.trust_remote_code:
+        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = (
+            args.trust_remote_code if args.trust_remote_code else True
+        )
+        args.model_args = (
+            args.model_args
+            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
+        )
+
     eval_logger.info(f"Selected Tasks: {task_names}")
     eval_logger.info("Loading selected tasks...")
 

diff --git a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -14,3 +14,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_2da.yaml b/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_2da
 dataset_name: arithmetic_2da
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml b/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_2dm
 dataset_name: arithmetic_2dm
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_2ds
 dataset_name: arithmetic_2ds
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_3da.yaml b/lm_eval/tasks/arithmetic/arithmetic_3da.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_3da
 dataset_name: arithmetic_3da
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_3ds
 dataset_name: arithmetic_3ds
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_4da.yaml b/lm_eval/tasks/arithmetic/arithmetic_4da.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_4da
 dataset_name: arithmetic_4da
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_4ds
 dataset_name: arithmetic_4ds
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_5da
 dataset_name: arithmetic_5da
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_5ds
 dataset_name: arithmetic_5ds
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/asdiv/default.yaml b/lm_eval/tasks/asdiv/default.yaml
@@ -12,3 +12,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml
@@ -20,3 +20,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 3.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml
@@ -22,3 +22,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 3.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/kobest/kobest_sentineg.yaml b/lm_eval/tasks/kobest/kobest_sentineg.yaml
@@ -21,3 +21,5 @@ metric_list:
     higher_is_better: True
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/kobest/kobest_wic.yaml b/lm_eval/tasks/kobest/kobest_wic.yaml
@@ -21,3 +21,5 @@ metric_list:
     higher_is_better: True
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/lambada/lambada_openai.yaml b/lm_eval/tasks/lambada/lambada_openai.yaml
@@ -18,3 +18,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/logiqa/logiqa.yaml b/lm_eval/tasks/logiqa/logiqa.yaml
@@ -19,3 +19,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml
@@ -25,3 +25,5 @@ filter_list:
       - function: "take_first"
 metadata:
   version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -23,3 +23,5 @@ num_fewshot: 0
 metadata:
   version: 1.0
   num_fewshot: 4
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
@@ -12,3 +12,5 @@ metric_list:
   - metric: acc
 metadata:
   version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
@@ -12,3 +12,5 @@ metric_list:
   - metric: acc
 metadata:
   version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
@@ -12,3 +12,5 @@ metric_list:
   - metric: acc
 metadata:
   version: 0.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/mutual/mutual.yaml b/lm_eval/tasks/mutual/mutual.yaml
@@ -23,3 +23,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/race/race.yaml b/lm_eval/tasks/race/race.yaml
@@ -12,3 +12,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml
@@ -18,3 +18,5 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml
@@ -18,3 +18,5 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml
@@ -18,3 +18,5 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml
@@ -18,3 +18,5 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/wikitext/wikitext.yaml b/lm_eval/tasks/wikitext/wikitext.yaml
@@ -16,3 +16,5 @@ metric_list:
   - metric: bits_per_byte
 metadata:
   version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.21.0",
     "evaluate",
-    "datasets>=2.14.0",
+    "datasets>=2.16.0",
     "evaluate>=0.4.0",
     "jsonlines",
     "numexpr",