From 59c782728db65980503ac645612e415050abb3ba Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Mon, 29 Jan 2024 14:02:50 -0500
Subject: [PATCH 01/16] initial eval server commit

---
 src/flamingo/jobs/lm_harness/config.py     |  8 +++++-
 src/flamingo/jobs/lm_harness/entrypoint.py | 32 +++++++++++++++-------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/flamingo/jobs/lm_harness/config.py b/src/flamingo/jobs/lm_harness/config.py
index f5d33b86..69eec279 100644
--- a/src/flamingo/jobs/lm_harness/config.py
+++ b/src/flamingo/jobs/lm_harness/config.py
@@ -23,11 +23,17 @@ class LMHarnessEvaluatorConfig(BaseFlamingoConfig):
     num_fewshot: int | None = None
     limit: int | float | None = None
 
 
+class InferenceServerConfig(BaseFlamingoConfig):
+    """Inference Server URL endpoint path"""
+
+    base_url: String | None
+
+
 class LMHarnessJobConfig(BaseFlamingoConfig):
     """Configuration to run an lm-evaluation-harness evaluation job."""
 
-    model: AutoModelConfig
+    model: AutoModelConfig | InferenceServerConfig
     evaluator: LMHarnessEvaluatorConfig
     quantization: QuantizationConfig | None = None
     tracking: WandbRunConfig | None = None
diff --git a/src/flamingo/jobs/lm_harness/entrypoint.py b/src/flamingo/jobs/lm_harness/entrypoint.py
index ec2efadf..d1e5769f 100644
--- a/src/flamingo/jobs/lm_harness/entrypoint.py
+++ b/src/flamingo/jobs/lm_harness/entrypoint.py
@@ -4,6 +4,7 @@
 import ray
 import wandb
 from lm_eval.models.huggingface import HFLM
+from lm_eval.models.OpenaiCompletionsLM import OpenaiCompletionsLM
 from peft import PeftConfig
 
 from flamingo.integrations.huggingface import resolve_loadable_path
@@ -30,19 +31,30 @@ def log_evaluation_artifact(run_name: str, results: dict[str, dict[str, Any]]) -
     return wandb.log_artifact(artifact)
 
 
-def load_harness_model(config: LMHarnessJobConfig) -> HFLM:
+def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM:
     # Helper method to return lm-harness model wrapper
     def loader(pretrained: str, tokenizer: str, peft: str | None):
         quantization_kwargs = config.quantization.dict() if config.quantization else {}
-        return HFLM(
-            pretrained=pretrained,
-            tokenizer=tokenizer,
-            peft=peft,
-            device="cuda" if config.ray.num_gpus > 0 else None,
-            trust_remote_code=config.model.trust_remote_code,
-            dtype=config.model.torch_dtype if config.model.torch_dtype else "auto",
-            **quantization_kwargs,
-        )
+
+        """Load model directly from HF if HF path, otherwise from an inference server URL"""
+
+        if isinstance(config.model) == AutoModelConfig:
+            return HFLM(
+                pretrained=pretrained,
+                tokenizer=tokenizer,
+                peft=peft,
+                device="cuda" if config.ray.num_gpus > 0 else None,
+                trust_remote_code=config.model.trust_remote_code,
+                dtype=config.model.torch_dtype if config.model.torch_dtype else "auto",
+                **quantization_kwargs,
+            )
+        elif isinstance(config.model) == InferenceServerConfig:
+            return OpenaiCompletionsLM(
+                model=pretrained,
+                base_url = base_url,
+                tokenizer = tokenizer,
+            )
+
     # We don't know if the checkpoint is adapter weights or merged model weights
     # Try to load as an adapter and fall back to the checkpoint containing the full model

From 72f097327a1075dcffc000f57be9161dd1c7ff60 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Mon, 29 Jan 2024 16:28:21 -0500
Subject: [PATCH 02/16] Adding Pydantic vllm types

---
 .../integrations/huggingface/model_config.py   |  3 ++-
 .../integrations/huggingface/repo_config.py    |  5 ++--
 src/flamingo/integrations/vllm/__init__.py     |  4 +++
 .../integrations/vllm/model_config.py          | 18 +++++++++++++
 src/flamingo/integrations/vllm/path_config.py  | 25 +++++++++++++++++++
 src/flamingo/jobs/lm_harness/config.py         | 14 +++++------
 src/flamingo/jobs/lm_harness/entrypoint.py     | 20 +++++++--------
 7 files changed, 67 insertions(+), 22 deletions(-)
 create mode 100644 src/flamingo/integrations/vllm/__init__.py
 create mode 100644 src/flamingo/integrations/vllm/model_config.py
 create mode 100644 src/flamingo/integrations/vllm/path_config.py

diff --git a/src/flamingo/integrations/huggingface/model_config.py b/src/flamingo/integrations/huggingface/model_config.py
index d8abec54..ad9b6117 100644
--- a/src/flamingo/integrations/huggingface/model_config.py
+++ b/src/flamingo/integrations/huggingface/model_config.py
@@ -1,6 +1,6 @@
 from pydantic import validator
 
-from flamingo.integrations.huggingface import HuggingFaceRepoConfig, convert_string_to_repo_config
+from flamingo.integrations.huggingface import HuggingFaceRepoConfig,convert_string_to_repo_config
 from flamingo.integrations.wandb import WandbArtifactConfig
 from flamingo.types import BaseFlamingoConfig, TorchDtypeString
 
@@ -18,3 +18,4 @@ class AutoModelConfig(BaseFlamingoConfig):
     _validate_load_from_string = validator("load_from", pre=True, allow_reuse=True)(
         convert_string_to_repo_config
     )
+
diff --git a/src/flamingo/integrations/huggingface/repo_config.py b/src/flamingo/integrations/huggingface/repo_config.py
index 461a19fb..32e47429 100644
--- a/src/flamingo/integrations/huggingface/repo_config.py
+++ b/src/flamingo/integrations/huggingface/repo_config.py
@@ -42,6 +42,5 @@ def validate_repo_id(cls, x):
             raise ValueError(f"{x} is not a valid HuggingFace repo ID.")
         return x
 
-
-LoadFromConfig = HuggingFaceRepoConfig | WandbArtifactConfig
-"""Config that can be resolved to a HuggingFace name/path."""
+LoadFromLocalConfig = HuggingFaceRepoConfig | WandbArtifactConfig
+"""Config that can be resolved to a HuggingFace name/path or a local path."""
diff --git a/src/flamingo/integrations/vllm/__init__.py b/src/flamingo/integrations/vllm/__init__.py
new file mode 100644
index 00000000..e0f203ed
--- /dev/null
+++ b/src/flamingo/integrations/vllm/__init__.py
@@ -0,0 +1,4 @@
+# ruff: noqa: I001
+from flamingo.integrations.vllm.model_config import *
+from flamingo.integrations.vllm.path_config import *
+
diff --git a/src/flamingo/integrations/vllm/model_config.py b/src/flamingo/integrations/vllm/model_config.py
new file mode 100644
index 00000000..698c9d92
--- /dev/null
+++ b/src/flamingo/integrations/vllm/model_config.py
@@ -0,0 +1,18 @@
+from pydantic import validator
+
+from flamingo.integrations.wandb import WandbArtifactConfig
+from flamingo.types import BaseFlamingoConfig, TorchDtypeString
+from flamingo.integrations.vllm import LocalServerConfig
+
+
+class InferenceServerConfig(BaseFlamingoConfig):
+    """Inference Server URL endpoint path"""
+
+    load_from: LocalServerConfig | WandbArtifactConfig
+
+    trust_remote_code: bool = False
+    torch_dtype: TorchDtypeString | None = None
+
+    _validate_load_from_string = validator("load_from", pre=True, allow_reuse=True)(
+        convert_string_to_repo_config
+    )
\ No newline at end of file
diff --git a/src/flamingo/integrations/vllm/path_config.py b/src/flamingo/integrations/vllm/path_config.py
new file mode 100644
index 00000000..95bdf3c4
--- /dev/null
+++ b/src/flamingo/integrations/vllm/path_config.py
@@ -0,0 +1,25 @@
+from typing import Any
+
+from huggingface_hub.utils import HFValidationError, validate_repo_id
+from pydantic import validator
+
+from flamingo.integrations.wandb import WandbArtifactConfig
+from flamingo.types import BaseFlamingoConfig
+
+
+class LocalServerConfig(BaseFlamingoConfig):
+    """Configuration for a HuggingFace Hub repository."""
+
+    __match_args__ = ("path")
+
+    path: str
+
+    @validator("path", pre=True)
+    def validate_repo_id(cls, x):
+        if isinstance(x, str) and "v1/completions" not in x:
+            raise ValueError(f"{x} is not a valid vLLM OpenAI style inference server.")
+        return x
+
+
+LoadFromLocalConfig = LocalServerConfig | WandbArtifactConfig
+"""Config that can be resolved to a local server path or a weightsandbiases path."""
diff --git a/src/flamingo/jobs/lm_harness/config.py b/src/flamingo/jobs/lm_harness/config.py
index 69eec279..78355732 100644
--- a/src/flamingo/jobs/lm_harness/config.py
+++ b/src/flamingo/jobs/lm_harness/config.py
@@ -2,6 +2,7 @@
 
 from pydantic import Field, conlist, validator
 
+from flamingo.integrations.vllm import LocalServerConfig
 from flamingo.integrations.huggingface import AutoModelConfig, QuantizationConfig
 from flamingo.integrations.wandb import WandbRunConfig
 from flamingo.types import BaseFlamingoConfig
@@ -23,17 +24,12 @@ class LMHarnessEvaluatorConfig(BaseFlamingoConfig):
     num_fewshot: int | None = None
     limit: int | float | None = None
 
-class InferenceServerConfig(BaseFlamingoConfig):
-    """Inference Server URL endpoint path"""
-
-    base_url: String | None
-
-
 
 class LMHarnessJobConfig(BaseFlamingoConfig):
     """Configuration to run an lm-evaluation-harness evaluation job."""
 
-    model: AutoModelConfig | InferenceServerConfig
+    model: AutoModelConfig | InferenceServerConfig = None
     evaluator: LMHarnessEvaluatorConfig
     quantization: QuantizationConfig | None = None
     tracking: WandbRunConfig | None = None
@@ -41,7 +37,9 @@ class LMHarnessJobConfig(BaseFlamingoConfig):
 
     @validator("model", pre=True, always=True)
     def validate_model_arg(cls, x):
-        """Allow for passing just a path string as the model argument."""
-        if isinstance(x, str):
+        """Allow for passing a path string as the model argument."""
+        if "v1/completions" in x:
+            return InferenceServerConfig(load_from=x)
+        else:
             return AutoModelConfig(load_from=x)
         return x
diff --git a/src/flamingo/jobs/lm_harness/entrypoint.py b/src/flamingo/jobs/lm_harness/entrypoint.py
index d1e5769f..18c684e1 100644
--- a/src/flamingo/jobs/lm_harness/entrypoint.py
+++ b/src/flamingo/jobs/lm_harness/entrypoint.py
@@ -4,10 +4,11 @@
 import ray
 import wandb
 from lm_eval.models.huggingface import HFLM
-from lm_eval.models.OpenaiCompletionsLM import OpenaiCompletionsLM
+from lm_eval.models.openai_completions import OpenaiCompletionsLM
 from peft import PeftConfig
 
-from flamingo.integrations.huggingface import resolve_loadable_path
+from flamingo.integrations.huggingface import AutoModelConfig, resolve_loadable_path
+from flamingo.integrations.vllm import InferenceServerConfig
 from flamingo.integrations.wandb import (
     ArtifactType,
     WandbResumeMode,
@@ -33,14 +34,14 @@ def log_evaluation_artifact(run_name: str, results: dict[str, dict[str, Any]]) -
 
 def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM:
     # Helper method to return lm-harness model wrapper
-    def loader(pretrained: str, tokenizer: str, peft: str | None):
-        quantization_kwargs = config.quantization.dict() if config.quantization else {}
-
+    def loader(model: str | None, tokenizer: str, peft: str | None):
         """Load model directly from HF if HF path, otherwise from an inference server URL"""
 
         if isinstance(config.model) == AutoModelConfig:
+            quantization_kwargs = config.quantization.dict() if config.quantization else {}
+
             return HFLM(
-                pretrained=pretrained,
+                pretrained=model,
                 tokenizer=tokenizer,
                 peft=peft,
                 device="cuda" if config.ray.num_gpus > 0 else None,
@@ -50,12 +51,11 @@ def loader(pretrained: str, tokenizer: str, peft: str | None):
             )
         elif isinstance(config.model) == InferenceServerConfig:
             return OpenaiCompletionsLM(
-                model=pretrained,
-                base_url = base_url,
-                tokenizer = tokenizer,
+                model=model,
+                base_url=base_url,
+                tokenizer=tokenizer,
             )
 
-
     # We don't know if the checkpoint is adapter weights or merged model weights
     # Try to load as an adapter and fall back to the checkpoint containing the full model
     load_path, revision = resolve_loadable_path(config.model.load_from)

From 0c048fed3d01b3fa1de9238ab3413f5ea58d3e48 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 15:40:20 -0500
Subject: [PATCH 03/16] example file

---
 examples/configs/lm_harness_hf_config.yaml | 25 ++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 examples/configs/lm_harness_hf_config.yaml

diff --git a/examples/configs/lm_harness_hf_config.yaml b/examples/configs/lm_harness_hf_config.yaml
new file mode 100644
index 00000000..09a1ce79
--- /dev/null
+++ b/examples/configs/lm_harness_hf_config.yaml
@@ -0,0 +1,25 @@
+# Model to evaluate
+model:
+  load_from: "distilgpt2"
+  torch_dtype: "bfloat16"
+
+
+# Settings specific to lm_harness.evaluate
+evaluator:
+  tasks: ["hellaswag"]
+  num_fewshot: 5
+  limit: 10
+
+quantization:
+  load_in_4bit: True
+  bnb_4bit_quant_type: "fp4"
+
+# Tracking info for where to log the run results
+tracking:
+  name: "flamingo-example-lm-harness"
+  project: "flamingo-examples"
+  entity: "mozilla-ai"
+
+ray:
+  num_cpus: 1
+  num_gpus: 1
\ No newline at end of file

From f5668a1984d48898fc5605a4b078c5ad7c86dff4 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 15:49:01 -0500
Subject: [PATCH 04/16] simplifying vllm config

---
 src/flamingo/integrations/vllm/__init__.py     |  2 --
 .../integrations/vllm/model_config.py          | 15 ++---------
 src/flamingo/integrations/vllm/path_config.py  | 25 -------------------
 src/flamingo/jobs/lm_harness/config.py         | 12 +--------
 src/flamingo/jobs/lm_harness/entrypoint.py     |  3 ++-
 5 files changed, 5 insertions(+), 52 deletions(-)
 delete mode 100644 src/flamingo/integrations/vllm/path_config.py

diff --git a/src/flamingo/integrations/vllm/__init__.py b/src/flamingo/integrations/vllm/__init__.py
index e0f203ed..9da28dbc 100644
--- a/src/flamingo/integrations/vllm/__init__.py
+++ b/src/flamingo/integrations/vllm/__init__.py
@@ -1,4 +1,2 @@
 # ruff: noqa: I001
 from flamingo.integrations.vllm.model_config import *
-from flamingo.integrations.vllm.path_config.py import *
-
diff --git a/src/flamingo/integrations/vllm/model_config.py b/src/flamingo/integrations/vllm/model_config.py
index 698c9d92..629b56aa 100644
--- a/src/flamingo/integrations/vllm/model_config.py
+++ b/src/flamingo/integrations/vllm/model_config.py
@@ -1,18 +1,7 @@
-from pydantic import validator
-
-from flamingo.integrations.wandb import WandbArtifactConfig
-from flamingo.types import BaseFlamingoConfig, TorchDtypeString
-from flamingo.integrations.vllm import LocalServerConfig
+from flamingo.types import BaseFlamingoConfig
 
 
 class InferenceServerConfig(BaseFlamingoConfig):
     """Inference Server URL endpoint path"""
 
-    load_from: LocalServerConfig | WandbArtifactConfig
-
-    trust_remote_code: bool = False
-    torch_dtype: TorchDtypeString | None = None
-
-    _validate_load_from_string = validator("load_from", pre=True, allow_reuse=True)(
-        convert_string_to_repo_config
-    )
\ No newline at end of file
+    base_url: str
diff --git a/src/flamingo/integrations/vllm/path_config.py b/src/flamingo/integrations/vllm/path_config.py
deleted file mode 100644
index 95bdf3c4..00000000
--- a/src/flamingo/integrations/vllm/path_config.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from typing import Any
-
-from huggingface_hub.utils import HFValidationError, validate_repo_id
-from pydantic import validator
-
-from flamingo.integrations.wandb import WandbArtifactConfig
-from flamingo.types import BaseFlamingoConfig
-
-
-class LocalServerConfig(BaseFlamingoConfig):
-    """Configuration for a HuggingFace Hub repository."""
-
-    __match_args__ = ("path")
-
-    path: str
-
-    @validator("path", pre=True)
-    def validate_repo_id(cls, x):
-        if isinstance(x, str) and "v1/completions" not in x:
-            raise ValueError(f"{x} is not a valid vLLM OpenAI style inference server.")
-        return x
-
-
-LoadFromLocalConfig = LocalServerConfig | WandbArtifactConfig
-"""Config that can be resolved to a local server path or a weightsandbiases path."""
diff --git a/src/flamingo/jobs/lm_harness/config.py b/src/flamingo/jobs/lm_harness/config.py
index 78355732..1687f4ce 100644
--- a/src/flamingo/jobs/lm_harness/config.py
+++ b/src/flamingo/jobs/lm_harness/config.py
@@ -2,8 +2,8 @@
 
 from pydantic import Field, conlist, validator
 
-from flamingo.integrations.vllm import LocalServerConfig
 from flamingo.integrations.huggingface import AutoModelConfig, QuantizationConfig
+from flamingo.integrations.vllm import InferenceServerConfig
 from flamingo.integrations.wandb import WandbRunConfig
 from flamingo.types import BaseFlamingoConfig
@@ -25,7 +25,6 @@ class LMHarnessEvaluatorConfig(BaseFlamingoConfig):
     limit: int | float | None = None
 
-
 
 class LMHarnessJobConfig(BaseFlamingoConfig):
     """Configuration to run an lm-evaluation-harness evaluation job."""
 
@@ -34,12 +33,3 @@ class LMHarnessJobConfig(BaseFlamingoConfig):
     quantization: QuantizationConfig | None = None
     tracking: WandbRunConfig | None = None
     ray: LMHarnessRayConfig = Field(default_factory=LMHarnessRayConfig)
-
-    @validator("model", pre=True, always=True)
-    def validate_model_arg(cls, x):
-        """Allow for passing a path string as the model argument."""
-        if "v1/completions" in x:
-            return InferenceServerConfig(load_from=x)
-        else:
-            return AutoModelConfig(load_from=x)
-        return x
diff --git a/src/flamingo/jobs/lm_harness/entrypoint.py b/src/flamingo/jobs/lm_harness/entrypoint.py
index 18c684e1..dcd7d18b 100644
--- a/src/flamingo/jobs/lm_harness/entrypoint.py
+++ b/src/flamingo/jobs/lm_harness/entrypoint.py
@@ -34,7 +34,8 @@ def log_evaluation_artifact(run_name: str, results: dict[str, dict[str, Any]]) -
 
 def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM:
     # Helper method to return lm-harness model wrapper
-    def loader(model: str | None, tokenizer: str, peft: str | None):
+    def _loader(model: str | None , tokenizer: str, peft: str | None):
+
         """Load model directly from HF if HF path, otherwise from an inference server URL"""
 
         if isinstance(config.model) == AutoModelConfig:

From b57df408c5ba712656458875d6157fb631a13210 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 15:52:05 -0500
Subject: [PATCH 05/16] fix None type

---
 src/flamingo/jobs/lm_harness/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/flamingo/jobs/lm_harness/config.py b/src/flamingo/jobs/lm_harness/config.py
index 1687f4ce..30be6aa3 100644
--- a/src/flamingo/jobs/lm_harness/config.py
+++ b/src/flamingo/jobs/lm_harness/config.py
@@ -28,7 +28,7 @@ class LMHarnessEvaluatorConfig(BaseFlamingoConfig):
 
 class LMHarnessJobConfig(BaseFlamingoConfig):
     """Configuration to run an lm-evaluation-harness evaluation job."""
 
-    model: AutoModelConfig | InferenceServerConfig = None
+    model: AutoModelConfig | InferenceServerConfig
     evaluator: LMHarnessEvaluatorConfig
     quantization: QuantizationConfig | None = None
     tracking: WandbRunConfig | None = None

From 0aa4f85736088d197df8a5558059c0240a55916d Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 15:56:49 -0500
Subject: [PATCH 06/16] vllm conf example

---
 examples/configs/lm_harness.yaml             | 25 --------------------
 examples/configs/lm_harness_vllm_config.yaml | 17 +++++++++++++
 2 files changed, 17 insertions(+), 25 deletions(-)
 delete mode 100644 examples/configs/lm_harness.yaml
 create mode 100644 examples/configs/lm_harness_vllm_config.yaml

diff --git a/examples/configs/lm_harness.yaml b/examples/configs/lm_harness.yaml
deleted file mode 100644
index 1e8380ca..00000000
--- a/examples/configs/lm_harness.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# Model to evaluate
-model:
-  load_from: "distilgpt2"
-  torch_dtype: "bfloat16"
-
-# Settings specific to lm_harness.evaluate
-evaluator:
-  tasks: ["hellaswag"]
-  num_fewshot: 5
-  limit: 10
-
-quantization:
-  load_in_4bit: True
-  bnb_4bit_quant_type: "fp4"
-
-# Tracking info for where to log the run results
-tracking:
-  name: "flamingo-example-lm-harness"
-  project: "flamingo-examples"
-  entity: "mozilla-ai"
-
-ray:
-  num_cpus: 1
-  num_gpus: 1
-  timeout: 3600
diff --git a/examples/configs/lm_harness_vllm_config.yaml b/examples/configs/lm_harness_vllm_config.yaml
new file mode 100644
index 00000000..4ea1d851
--- /dev/null
+++ b/examples/configs/lm_harness_vllm_config.yaml
@@ -0,0 +1,17 @@
+# Model to evaluate, specified as a W&B artifact
+model:
+  base_url: "1.2.3.4:8000/v1/completions"
+
+# Settings specific to lm_harness.evaluate
+evaluator:
+  tasks: ["gsk8"]
+  num_fewshot: 5
+
+tracking:
+  name: "mistral-finetune"
+  project: "mistral-finetune"
+  entity: "mozilla-ai"
+
+ray:
+  num_cpus: 1
+  timeout: 3600
\ No newline at end of file

From 117c32865b1ff677ca3d244096cd1bb2f25481d9 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 15:58:23 -0500
Subject: [PATCH 07/16] fix text

---
 src/flamingo/integrations/huggingface/repo_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/flamingo/integrations/huggingface/repo_config.py b/src/flamingo/integrations/huggingface/repo_config.py
index 32e47429..7e0ac460 100644
--- a/src/flamingo/integrations/huggingface/repo_config.py
+++ b/src/flamingo/integrations/huggingface/repo_config.py
@@ -42,5 +42,6 @@ def validate_repo_id(cls, x):
             raise ValueError(f"{x} is not a valid HuggingFace repo ID.")
         return x
 
-LoadFromLocalConfig = HuggingFaceRepoConfig | WandbArtifactConfig
+
+LoadFromConfig = HuggingFaceRepoConfig | WandbArtifactConfig
 """Config that can be resolved to a HuggingFace name/path or a local path."""

From d946c821d673b7a4c0aac296b61ae03243020c44 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Tue, 30 Jan 2024 16:01:33 -0500
Subject: [PATCH 08/16] fix test

---
 examples/configs/lm_harness_hf_config.yaml |  1 -
 tests/unit/jobs/test_lm_harness_config.py  | 17 +----------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/examples/configs/lm_harness_hf_config.yaml b/examples/configs/lm_harness_hf_config.yaml
index 09a1ce79..df5a2765 100644
--- a/examples/configs/lm_harness_hf_config.yaml
+++ b/examples/configs/lm_harness_hf_config.yaml
@@ -3,7 +3,6 @@ model:
   load_from: "distilgpt2"
   torch_dtype: "bfloat16"
 
-
 # Settings specific to lm_harness.evaluate
 evaluator:
   tasks: ["hellaswag"]
diff --git a/tests/unit/jobs/test_lm_harness_config.py b/tests/unit/jobs/test_lm_harness_config.py
index a2a07bdc..7a93b0a9 100644
--- a/tests/unit/jobs/test_lm_harness_config.py
+++ b/tests/unit/jobs/test_lm_harness_config.py
@@ -54,21 +54,6 @@ def test_parse_yaml_file(lm_harness_job_config):
 
 def test_load_example_config(examples_dir):
     """Load the example configs to make sure they stay up to date."""
-    config_file = examples_dir / "configs" / "lm_harness.yaml"
+    config_file = examples_dir / "configs" / "lm_harness_hf_config.yaml"
     config = LMHarnessJobConfig.from_yaml_file(config_file)
     assert LMHarnessJobConfig.parse_raw(config.json()) == config
-
-
-def test_model_validation(lm_harness_evaluator_config):
-    model_repo = HuggingFaceRepoConfig(repo_id="model_repo")
-    allowed_config = LMHarnessJobConfig(
-        model=model_repo.repo_id,
-        evaluator=lm_harness_evaluator_config,
-    )
-    assert allowed_config.model.load_from == model_repo
-
-    with pytest.raises(ValidationError):
-        LMHarnessJobConfig(model="invalid...hf..repo", evaluator=lm_harness_evaluator_config)
-
-    with pytest.raises(ValidationError):
-        LMHarnessJobConfig(model=12345, evaluator=lm_harness_evaluator_config)

From 7665c56f28cd3cf184a6568629ce35b271150796 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Wed, 31 Jan 2024 06:21:16 -0500
Subject: [PATCH 09/16] unit tests

---
 .../integrations/vllm/model_config.py     |  1 +
 tests/unit/conftest.py                     |  6 ++++
 tests/unit/jobs/test_lm_harness_config.py  | 30 +++++++++++++++++++--
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/flamingo/integrations/vllm/model_config.py b/src/flamingo/integrations/vllm/model_config.py
index 629b56aa..3d916a53 100644
--- a/src/flamingo/integrations/vllm/model_config.py
+++ b/src/flamingo/integrations/vllm/model_config.py
@@ -4,4 +4,5 @@ class InferenceServerConfig(BaseFlamingoConfig):
     """Inference Server URL endpoint path"""
 
+    load_from: HuggingFaceRepoConfig
     base_url: str
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 934e5f1c..be5472f3 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -8,6 +8,7 @@
     QuantizationConfig,
     TextDatasetConfig,
 )
+from flamingo.integrations.vllm import InferenceServerConfig
 from flamingo.integrations.wandb import WandbArtifactConfig, WandbRunConfig
 
 
@@ -22,6 +23,11 @@ def model_config_with_artifact():
     return AutoModelConfig(load_from=artifact, trust_remote_code=True)
 
 
+def model_config_with_vllm():
+    artifact = InferenceServerConfig(base_url="1.2.3.4:8000/v1/completions")
+    return AutoModelConfig(load_from=artifact, trust_remote_code=True)
+
+
 @pytest.fixture
 def tokenizer_config_with_repo_id():
     return AutoTokenizerConfig(load_from="mistral-ai/mistral-7", trust_remote_code=True)
diff --git a/tests/unit/jobs/test_lm_harness_config.py b/tests/unit/jobs/test_lm_harness_config.py
index 7a93b0a9..dd324a29 100644
--- a/tests/unit/jobs/test_lm_harness_config.py
+++ b/tests/unit/jobs/test_lm_harness_config.py
@@ -26,6 +26,9 @@ def lm_harness_ray_config():
     )
 
 
+""" Test for HuggingFace model"""
+
+
 @pytest.fixture
 def lm_harness_job_config(
     model_config_with_artifact,
@@ -43,6 +46,26 @@ def lm_harness_job_config(
     )
 
 
+"""test for vLLM-loaded model"""
+
+
+@pytest.fixture
+def lm_harness_vllm_job_config(
+    model_config_with_vllm,
+    quantization_config,
+    wandb_run_config,
+    lm_harness_evaluator_config,
+    lm_harness_ray_config,
+):
+    return LMHarnessJobConfig(
+        model=model_config_with_artifact,
+        evaluator=lm_harness_evaluator_config,
+        ray=lm_harness_ray_config,
+        tracking=wandb_run_config,
+        quantization=quantization_config,
+    )
+
+
 def test_serde_round_trip(lm_harness_job_config):
     assert LMHarnessJobConfig.parse_raw(lm_harness_job_config.json()) == lm_harness_job_config
 
@@ -52,8 +75,11 @@ def test_parse_yaml_file(lm_harness_job_config):
         assert lm_harness_job_config == LMHarnessJobConfig.from_yaml_file(config_path)
 
 
-def test_load_example_config(examples_dir):
+@pytest.mark.parametrize(
+    "file_suffix", ["lm_harness_hf_config.yaml", "lm_harness_vllm_config.yaml"]
+)
+def test_load_example_config(examples_dir, file_suffix):
     """Load the example configs to make sure they stay up to date."""
-    config_file = examples_dir / "configs" / "lm_harness_hf_config.yaml"
+    config_file = examples_dir / "configs" / file_suffix
     config = LMHarnessJobConfig.from_yaml_file(config_file)
     assert LMHarnessJobConfig.parse_raw(config.json()) == config

From d5656315896ee8a205f82f16aa3f0e65243fc7f5 Mon Sep 17 00:00:00 2001
From: Vicki Boykis
Date: Wed, 31 Jan 2024 09:15:34 -0500
Subject: [PATCH 10/16] adding more unit tests

---
 CONTRIBUTING.md                                |  5 ++-
 README.md                                      |  6 ++--
 examples/configs/lm_harness_hf_config.yaml     |  4 +--
 examples/configs/lm_harness_vllm_config.yaml   |  2 +-
 .../integrations/huggingface/model_config.py   |  3 +-
 .../integrations/vllm/model_config.py          |  1 -
 src/flamingo/jobs/lm_harness/config.py         |  2 +-
 src/flamingo/jobs/lm_harness/entrypoint.py     | 36 +++++++++----------
 tests/conftest.py                              |  2 +-
 tests/integration/conftest.py                  |  2 +-
 tests/resources/README.md                      |  2 +-
 .../datasets/xyz.hf/dataset_info.json          |  2 +-
 tests/resources/datasets/xyz.hf/state.json     |  2 +-
 tests/unit/conftest.py                         |  3 +-
 tests/unit/jobs/test_lm_harness_config.py      |  4 +--
 15 files changed, 34 insertions(+), 42 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3d3c3d37..6d55960a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -13,7 +13,7 @@ Ruff will pick up the configuration defined in the `pyproject.toml` file automat
 
 `flamingo` is intended to be installed as a pip requirement in the runtime environment of a Ray job.
 However, it is often desirable to test local branches on Ray before publishing a new version of the library.
-This is possible submitting a Ray job with a runtime environment that points to your 
+This is possible submitting a Ray job with a runtime environment that points to your
 development branch of the `flamingo` repo.
 
 To do so, follow the steps:
@@ -24,7 +24,7 @@ To do so, follow the steps:
     poetry export --without-hashes --with finetuning,evaluation -o requirements.txt
     ```
 
-    The following command will create a `requirements.txt` file in the repository 
+    The following command will create a `requirements.txt` file in the repository
     that contains the dependencies for the `finetuning` and `evaluation` job groups:
 
 2. When submitting a job to cluster, specify in the Ray runtime environment the following:
@@ -42,4 +42,3 @@ To do so, follow the steps:
     but does not install its entrypoint in the environment path.
 
 An example of this workflow can be found in the `examples/dev_workflow.ipynb` notebook.
-
diff --git a/README.md b/README.md
index d2e6c189..1e8b4b5c 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ This will install an editable version of the package along with all of its depen
 
 Poetry should recognize your active virtual environment during installation
 If you have an active Conda environment, Poetry should recognize it during installation
-and install the package dependencies there. 
+and install the package dependencies there.
 This hasn't been explicitly tested with other virtual python environments, but will likely work.
 
 Alternatively, you can use poetry's own environment by running
@@ -44,7 +44,7 @@ poetry install
 where `python3.10` is your python interpreter.
 
 The `pyproject.toml` file defines dependency groups for the logical job types in the package.
-Individual dependency groups can be installed by running 
+Individual dependency groups can be installed by running
 `poetry install --with ,` or `poetry install --only `.
 
 See the [contributing](CONTRIBUTING.md) guide for more information on development workflows.
@@ -52,7 +52,7 @@ See the [contributing](CONTRIBUTING.md) guide for more information on developmen
 ### Usage
 
 `flamingo` exposes a simple CLI with a few commands, one for each Ray job type.
-Jobs are expected to take as input a YAML configuration file 
+Jobs are expected to take as input a YAML configuration file
 that contains all necessary parameters/settings for the work.
 See the `examples/configs` folder for examples of the configuration structure.
diff --git a/examples/configs/lm_harness_hf_config.yaml b/examples/configs/lm_harness_hf_config.yaml
index df5a2765..0b4520df 100644
--- a/examples/configs/lm_harness_hf_config.yaml
+++ b/examples/configs/lm_harness_hf_config.yaml
@@ -2,7 +2,7 @@
 model:
   load_from: "distilgpt2"
   torch_dtype: "bfloat16"
- 
+
 # Settings specific to lm_harness.evaluate
 evaluator:
   tasks: ["hellaswag"]
@@ -21,4 +21,4 @@ tracking:
 
 ray:
   num_cpus: 1
-  num_gpus: 1
\ No newline at end of file
+  num_gpus: 1
diff --git a/examples/configs/lm_harness_vllm_config.yaml b/examples/configs/lm_harness_vllm_config.yaml
index 4ea1d851..0a869955 100644
--- a/examples/configs/lm_harness_vllm_config.yaml
+++ b/examples/configs/lm_harness_vllm_config.yaml
@@ -14,4 +14,4 @@ tracking:
 
 ray:
   num_cpus: 1
-  timeout: 3600
\ No newline at end of file
+  timeout: 3600
diff --git a/src/flamingo/integrations/huggingface/model_config.py b/src/flamingo/integrations/huggingface/model_config.py
index ad9b6117..d8abec54 100644
--- a/src/flamingo/integrations/huggingface/model_config.py
+++ b/src/flamingo/integrations/huggingface/model_config.py
@@ -1,6 +1,6 @@
 from pydantic import validator
 
-from flamingo.integrations.huggingface import HuggingFaceRepoConfig,convert_string_to_repo_config
+from flamingo.integrations.huggingface import HuggingFaceRepoConfig, convert_string_to_repo_config
 from flamingo.integrations.wandb import WandbArtifactConfig
 from flamingo.types import BaseFlamingoConfig, TorchDtypeString
 
@@ -18,4 +18,3 @@ class AutoModelConfig(BaseFlamingoConfig):
     _validate_load_from_string = validator("load_from", pre=True, allow_reuse=True)(
         convert_string_to_repo_config
     )
-
diff --git a/src/flamingo/integrations/vllm/model_config.py b/src/flamingo/integrations/vllm/model_config.py
index 3d916a53..629b56aa 100644
--- a/src/flamingo/integrations/vllm/model_config.py
+++ b/src/flamingo/integrations/vllm/model_config.py
@@ -4,5 +4,4 @@ class InferenceServerConfig(BaseFlamingoConfig):
     """Inference Server URL endpoint path"""
 
-    load_from: HuggingFaceRepoConfig
     base_url: str
diff --git a/src/flamingo/jobs/lm_harness/config.py b/src/flamingo/jobs/lm_harness/config.py
index 30be6aa3..5fbfe897 100644
--- a/src/flamingo/jobs/lm_harness/config.py
+++ b/src/flamingo/jobs/lm_harness/config.py
@@ -1,6 +1,6 @@
 import datetime
 
-from pydantic import Field, conlist, validator
+from pydantic import Field, conlist
 
 from flamingo.integrations.huggingface import AutoModelConfig, QuantizationConfig
 from flamingo.integrations.vllm import InferenceServerConfig
 from flamingo.integrations.wandb import WandbRunConfig
diff --git a/src/flamingo/jobs/lm_harness/entrypoint.py b/src/flamingo/jobs/lm_harness/entrypoint.py
index dcd7d18b..e16e9874 100644
--- a/src/flamingo/jobs/lm_harness/entrypoint.py
+++ b/src/flamingo/jobs/lm_harness/entrypoint.py
@@ -34,8 +34,7 @@ def log_evaluation_artifact(run_name: str, results: dict[str, dict[str, Any]]) -
 
 def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM:
     # Helper method to return lm-harness model wrapper
-    def _loader(model: str | None , tokenizer: str, peft: str | None):
-
+    def loader(model: str | None, tokenizer: str, base_url: str | None, peft: str | None):
         """Load model directly from HF if HF path, otherwise from an inference server URL"""
 
         if isinstance(config.model) == AutoModelConfig:
@@ -50,6 +49,22 @@ def _loader(model: str | None , tokenizer: str, peft: str | None):
                 dtype=config.model.torch_dtype if config.model.torch_dtype else "auto",
                 **quantization_kwargs,
             )
+            # We don't know if the checkpoint is adapter weights or merged model weights
+            # Try to load as an adapter and fall back to the checkpoint containing the full model
+            load_path, revision = resolve_loadable_path(config.model.load_from)
+            try:
+                peft_config = PeftConfig.from_pretrained(load_path, revision=revision)
+                return loader(
+                    pretrained=peft_config.base_model_name_or_path,
+                    tokenizer=peft_config.base_model_name_or_path,
+                    peft=load_path,
+                )
+            except ValueError as e:
+                print(
+                    f"Unable to load model as adapter: {e}. "
+                    "This is expected if the checkpoint does not contain adapter weights."
+                )
+                return loader(pretrained=load_path, tokenizer=load_path, peft=None)
         elif isinstance(config.model) == InferenceServerConfig:
             return OpenaiCompletionsLM(
                 model=model,
@@ -57,23 +72,6 @@ def _loader(model: str | None , tokenizer: str, peft: str | None):
                 tokenizer=tokenizer,
             )
 
-    # We don't know if the checkpoint is adapter weights or merged model weights
-    # Try to load as an adapter and fall back to the checkpoint containing the full model
-    load_path, revision = resolve_loadable_path(config.model.load_from)
-    try:
-        peft_config = PeftConfig.from_pretrained(load_path, revision=revision)
-        return loader(
-            pretrained=peft_config.base_model_name_or_path,
-            tokenizer=peft_config.base_model_name_or_path,
-            peft=load_path,
-        )
-    except ValueError as e:
-        print(
-            f"Unable to load model as adapter: {e}. "
-            "This is expected if the checkpoint does not contain adapter weights."
-        )
-        return loader(pretrained=load_path, tokenizer=load_path, peft=None)
-
 
 def load_and_evaluate(config: LMHarnessJobConfig) -> dict[str, Any]:
     print("Initializing lm-harness tasks...")
diff --git a/tests/conftest.py b/tests/conftest.py
index 89edd4f1..d18ffe9c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,5 @@
 """
-Tests for the Flamingo. 
+Tests for the Flamingo.
 
 This file is used to provide fixtures for the test session that are accessible to all submodules.
""" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index ec2f0c59..477a4f2a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -34,7 +34,7 @@ def disabled_wandb_env(temporary_storage_path): "WANDB_DIR": str(storage / "wandb" / "logs"), "WANDB_CACHE_DIR": str(storage / "wandb" / "cache"), "WANDB_CONFIG_DIR": str(storage / "wandb" / "configs"), - "WANDB_API_KEY": "MY-API-KEY", + "WANDB_API_KEY": "MY-API-KEY", # pragma: allowlist secret "WANDB_MODE": "disabled", } with mock.patch.dict(os.environ, wandb_env): diff --git a/tests/resources/README.md b/tests/resources/README.md index cd6305c8..2b970012 100644 --- a/tests/resources/README.md +++ b/tests/resources/README.md @@ -4,5 +4,5 @@ Collection of resources to load/parse during tests. These resources should be kept as small as possible to minimize the git repo size. -When applicable, helper scripts for re-generating the resources +When applicable, helper scripts for re-generating the resources can be added to the appropriate subfolders. diff --git a/tests/resources/datasets/xyz.hf/dataset_info.json b/tests/resources/datasets/xyz.hf/dataset_info.json index 872c12a3..63102527 100644 --- a/tests/resources/datasets/xyz.hf/dataset_info.json +++ b/tests/resources/datasets/xyz.hf/dataset_info.json @@ -17,4 +17,4 @@ }, "homepage": "", "license": "" -} \ No newline at end of file +} diff --git a/tests/resources/datasets/xyz.hf/state.json b/tests/resources/datasets/xyz.hf/state.json index 93fe3d1d..5bfaa1c6 100644 --- a/tests/resources/datasets/xyz.hf/state.json +++ b/tests/resources/datasets/xyz.hf/state.json @@ -10,4 +10,4 @@ "_format_type": null, "_output_all_columns": false, "_split": null -} \ No newline at end of file +} diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index be5472f3..155f34a2 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -24,8 +24,7 @@ def model_config_with_artifact(): def model_config_with_vllm(): - artifact = InferenceServerConfig(base_url="1.2.3.4:8000/v1/completions") - return AutoModelConfig(load_from=artifact, trust_remote_code=True) + return InferenceServerConfig(base_url="1.2.3.4:8000/v1/completions") @pytest.fixture diff --git a/tests/unit/jobs/test_lm_harness_config.py b/tests/unit/jobs/test_lm_harness_config.py index dd324a29..100dad53 100644 --- a/tests/unit/jobs/test_lm_harness_config.py +++ b/tests/unit/jobs/test_lm_harness_config.py @@ -1,7 +1,5 @@ import pytest -from pydantic import ValidationError -from flamingo.integrations.huggingface import HuggingFaceRepoConfig from flamingo.jobs.lm_harness import ( LMHarnessEvaluatorConfig, LMHarnessJobConfig, @@ -58,7 +56,7 @@ def lm_harness_vllm_job_config( lm_harness_ray_config, ): return LMHarnessJobConfig( - model=model_config_with_artifact, + model=model_config_with_vllm, evaluator=lm_harness_evaluator_config, ray=lm_harness_ray_config, tracking=wandb_run_config, From 7b1651f99dbd0db5a9adf78328e1d8fd8b428c78 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 09:17:20 -0500 Subject: [PATCH 11/16] requirements and precommit --- .gitignore | 3 +++ .pre-commit-config.yaml | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.gitignore b/.gitignore index 83fd0216..1c65f5f5 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,6 @@ cython_debug/ # Poetry poetry.lock + +# Ignore requirements since we only use for local builds +requirements.txt diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml new file mode 100644 index 00000000..94a13bd4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + - repo: https://github.com/Yelp/detect-secrets + rev: v1.2.0 + hooks: + - id: detect-secrets + exclude: requirements_lock.txt|tests/resources/datasets/xyz.hf/state.json + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-merge-conflict + - id: trailing-whitespace + - id: end-of-file-fixer + - id: requirements-txt-fixer + exclude: requirements_lock.txt + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.7 + hooks: + - id: ruff + args: [--exit-non-zero-on-fix] + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.7 + hooks: + - id: ruff-format From a14250c8c1f3009a7c3edb780b52369ac03d2291 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 09:45:52 -0500 Subject: [PATCH 12/16] fix eval name --- examples/configs/lm_harness_vllm_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/lm_harness_vllm_config.yaml b/examples/configs/lm_harness_vllm_config.yaml index 0a869955..11d8e46c 100644 --- a/examples/configs/lm_harness_vllm_config.yaml +++ b/examples/configs/lm_harness_vllm_config.yaml @@ -4,7 +4,7 @@ model: # Settings specific to lm_harness.evaluate evaluator: - tasks: ["gsk8"] + tasks: ["gsm8k"] num_fewshot: 5 tracking: From 3d40a372902d383f91eb15acc873d9044330ed83 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 09:49:38 -0500 Subject: [PATCH 13/16] cleanup --- src/flamingo/integrations/huggingface/repo_config.py | 2 +- tests/conftest.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/flamingo/integrations/huggingface/repo_config.py b/src/flamingo/integrations/huggingface/repo_config.py index 7e0ac460..461a19fb 100644 --- a/src/flamingo/integrations/huggingface/repo_config.py +++ b/src/flamingo/integrations/huggingface/repo_config.py @@ -44,4 +44,4 @@ def validate_repo_id(cls, x): LoadFromConfig = HuggingFaceRepoConfig | WandbArtifactConfig -"""Config that can be resolved to a HuggingFace name/path or a local path.""" +"""Config that can be resolved to a HuggingFace name/path.""" diff --git a/tests/conftest.py b/tests/conftest.py index d18ffe9c..0cac579d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,5 @@ """ -Tests for the Flamingo. - -This file is used to provide fixtures for the test session that are accessible to all submodules. +This file is used to provide fixtures for the test session accessible to all Flamingo submodules. 
""" from pathlib import Path From a50d6e61a2bf29a5cdaa96267a2116f09842b233 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 12:42:44 -0500 Subject: [PATCH 14/16] refactor unit test and precommit --- .pre-commit-config.yaml | 2 +- examples/configs/lm_harness_hf_config.yaml | 1 + examples/configs/lm_harness_vllm_config.yaml | 17 ------ pyproject.toml | 1 + src/flamingo/integrations/vllm/__init__.py | 1 - tests/integration/conftest.py | 2 +- tests/unit/conftest.py | 3 +- tests/unit/jobs/test_lm_harness_config.py | 60 ++++++++++---------- 8 files changed, 35 insertions(+), 52 deletions(-) delete mode 100644 examples/configs/lm_harness_vllm_config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94a13bd4..c2f14b3b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: v1.2.0 hooks: - id: detect-secrets - exclude: requirements_lock.txt|tests/resources/datasets/xyz.hf/state.json + exclude: tests/integration/.+ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 diff --git a/examples/configs/lm_harness_hf_config.yaml b/examples/configs/lm_harness_hf_config.yaml index 0b4520df..1e8380ca 100644 --- a/examples/configs/lm_harness_hf_config.yaml +++ b/examples/configs/lm_harness_hf_config.yaml @@ -22,3 +22,4 @@ tracking: ray: num_cpus: 1 num_gpus: 1 + timeout: 3600 diff --git a/examples/configs/lm_harness_vllm_config.yaml b/examples/configs/lm_harness_vllm_config.yaml deleted file mode 100644 index 11d8e46c..00000000 --- a/examples/configs/lm_harness_vllm_config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Model to evaluate, specified as a W&B artifact -model: - base_url: "1.2.3.4:8000/v1/completions" - -# Settings specific to lm_harness.evaluate -evaluator: - tasks: ["gsm8k"] - num_fewshot: 5 - -tracking: - name: "mistral-finetune" - project: "mistral-finetune" - entity: "mozilla-ai" - -ray: - num_cpus: 1 - timeout: 3600 diff --git a/pyproject.toml b/pyproject.toml index 1daea82d..4011dae5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ ruff = "0.1.7" pytest = "7.4.3" pytest-cov = "4.1.0" jupyter = "1.0.0" +pre-commit = "3.6.0" [tool.poetry.group.finetuning.dependencies] datasets = "2.16.1" diff --git a/src/flamingo/integrations/vllm/__init__.py b/src/flamingo/integrations/vllm/__init__.py index 9da28dbc..5e648365 100644 --- a/src/flamingo/integrations/vllm/__init__.py +++ b/src/flamingo/integrations/vllm/__init__.py @@ -1,2 +1 @@ -# ruff: noqa: I001 from flamingo.integrations.vllm.model_config import * diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 477a4f2a..ec2f0c59 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -34,7 +34,7 @@ def disabled_wandb_env(temporary_storage_path): "WANDB_DIR": str(storage / "wandb" / "logs"), "WANDB_CACHE_DIR": str(storage / "wandb" / "cache"), "WANDB_CONFIG_DIR": str(storage / "wandb" / "configs"), - "WANDB_API_KEY": "MY-API-KEY", # pragma: allowlist secret + "WANDB_API_KEY": "MY-API-KEY", "WANDB_MODE": "disabled", } with mock.patch.dict(os.environ, wandb_env): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 155f34a2..fc1af0f4 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -23,7 +23,8 @@ def model_config_with_artifact(): return AutoModelConfig(load_from=artifact, trust_remote_code=True) -def model_config_with_vllm(): +@pytest.fixture +def inference_server_config(): return InferenceServerConfig(base_url="1.2.3.4:8000/v1/completions") diff --git 
a/tests/unit/jobs/test_lm_harness_config.py b/tests/unit/jobs/test_lm_harness_config.py index 100dad53..b1a65483 100644 --- a/tests/unit/jobs/test_lm_harness_config.py +++ b/tests/unit/jobs/test_lm_harness_config.py @@ -24,57 +24,55 @@ def lm_harness_ray_config(): ) -""" Test for HuggingFace model""" - - @pytest.fixture def lm_harness_job_config( + request, model_config_with_artifact, + inference_server_config, quantization_config, wandb_run_config, lm_harness_evaluator_config, lm_harness_ray_config, ): - return LMHarnessJobConfig( - model=model_config_with_artifact, - evaluator=lm_harness_evaluator_config, - ray=lm_harness_ray_config, - tracking=wandb_run_config, - quantization=quantization_config, - ) - - -"""test for vLLM-loaded model""" - - -@pytest.fixture -def lm_harness_vllm_job_config( - model_config_with_vllm, - quantization_config, - wandb_run_config, - lm_harness_evaluator_config, - lm_harness_ray_config, -): - return LMHarnessJobConfig( - model=model_config_with_vllm, - evaluator=lm_harness_evaluator_config, - ray=lm_harness_ray_config, - tracking=wandb_run_config, - quantization=quantization_config, - ) + if request.param == "model_config_with_artifact": + return LMHarnessJobConfig( + model=model_config_with_artifact, + evaluator=lm_harness_evaluator_config, + ray=lm_harness_ray_config, + tracking=wandb_run_config, + quantization=quantization_config, + ) + elif request.param == "inference_server_config": + return LMHarnessJobConfig( + model=inference_server_config, + evaluator=lm_harness_evaluator_config, + ray=lm_harness_ray_config, + tracking=wandb_run_config, + quantization=quantization_config, + ) +@pytest.mark.parametrize( + "lm_harness_job_config", + ["model_config_with_artifact", "inference_server_config"], + indirect=True, +) def test_serde_round_trip(lm_harness_job_config): assert LMHarnessJobConfig.parse_raw(lm_harness_job_config.json()) == lm_harness_job_config +@pytest.mark.parametrize( + "lm_harness_job_config", + ["model_config_with_artifact", "inference_server_config"], + indirect=True, +) def test_parse_yaml_file(lm_harness_job_config): with lm_harness_job_config.to_tempfile() as config_path: assert lm_harness_job_config == LMHarnessJobConfig.from_yaml_file(config_path) @pytest.mark.parametrize( - "file_suffix", ["lm_harness_hf_config.yaml", "lm_harness_vllm_config.yaml"] + "file_suffix", ["lm_harness_hf_config.yaml", "lm_harness_inference_server_config.yaml"] ) def test_load_example_config(examples_dir, file_suffix): """Load the example configs to make sure they stay up to date.""" From 402f2f716f410faf35032924835ecc4440ec7216 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 13:57:43 -0500 Subject: [PATCH 15/16] change inference server to accept tokenizer --- .../lm_harness_inference_server_config.yaml | 19 +++++ pyproject.toml | 2 +- .../integrations/vllm/model_config.py | 1 + src/flamingo/jobs/lm_harness/entrypoint.py | 75 ++++++++++--------- tests/unit/conftest.py | 4 +- 5 files changed, 62 insertions(+), 39 deletions(-) create mode 100644 examples/configs/lm_harness_inference_server_config.yaml diff --git a/examples/configs/lm_harness_inference_server_config.yaml b/examples/configs/lm_harness_inference_server_config.yaml new file mode 100644 index 00000000..b14ce9ae --- /dev/null +++ b/examples/configs/lm_harness_inference_server_config.yaml @@ -0,0 +1,19 @@ +# Model to evaluate, specified as a W&B artifact +model: + base_url: "1.2.3.4:8000/v1/completions" + tokenizer: "mistralai/Mistral-7B-v0.1" + +# Settings specific to 
lm_harness.evaluate +evaluator: + tasks: ["gsm8k"] + num_fewshot: 5 + limit: 10 + +tracking: + name: "mistral-finetune" + project: "mistral-finetune" + entity: "mozilla-ai" + +ray: + num_cpus: 1 + timeout: 3600 diff --git a/pyproject.toml b/pyproject.toml index 4011dae5..77e4666a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ trl = "0.7.10" bitsandbytes = "0.42.0" [tool.poetry.group.evaluation.dependencies] -lm-eval = "0.4.0" +lm-eval = "0.4.1" einops = "0.7.0" [tool.poetry.scripts] diff --git a/src/flamingo/integrations/vllm/model_config.py b/src/flamingo/integrations/vllm/model_config.py index 629b56aa..3288be04 100644 --- a/src/flamingo/integrations/vllm/model_config.py +++ b/src/flamingo/integrations/vllm/model_config.py @@ -5,3 +5,4 @@ class InferenceServerConfig(BaseFlamingoConfig): """Inference Server URL endpoint path""" base_url: str + tokenizer: str diff --git a/src/flamingo/jobs/lm_harness/entrypoint.py b/src/flamingo/jobs/lm_harness/entrypoint.py index e16e9874..184c79ef 100644 --- a/src/flamingo/jobs/lm_harness/entrypoint.py +++ b/src/flamingo/jobs/lm_harness/entrypoint.py @@ -33,44 +33,45 @@ def log_evaluation_artifact(run_name: str, results: dict[str, dict[str, Any]]) - def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM: - # Helper method to return lm-harness model wrapper - def loader(model: str | None, tokenizer: str, base_url: str | None, peft: str | None): - """Load model directly from HF if HF path, otherwise from an inference server URL""" - - if isinstance(config.model) == AutoModelConfig: - quantization_kwargs = config.quantization.dict() if config.quantization else {} - - return HFLM( - pretrained=model, - tokenizer=tokenizer, - peft=peft, - device="cuda" if config.ray.num_gpus > 0 else None, - trust_remote_code=config.model.trust_remote_code, - dtype=config.model.torch_dtype if config.model.torch_dtype else "auto", - **quantization_kwargs, - ) - # We don't know if the checkpoint is adapter weights or merged model weights - # Try to load as an adapter and fall back to the checkpoint containing the full model - load_path, revision = resolve_loadable_path(config.model.load_from) - try: - peft_config = PeftConfig.from_pretrained(load_path, revision=revision) - return loader( - pretrained=peft_config.base_model_name_or_path, - tokenizer=peft_config.base_model_name_or_path, - peft=load_path, - ) - except ValueError as e: - print( - f"Unable to load model as adapter: {e}. " - "This is expected if the checkpoint does not contain adapter weights." - ) - return loader(pretrained=load_path, tokenizer=load_path, peft=None) - elif isinstance(config.model) == InferenceServerConfig: - return OpenaiCompletionsLM( - model=model, - base_url=base_url, - tokenizer=tokenizer, + if isinstance(config.model, AutoModelConfig): + # We don't know if the checkpoint is adapter weights or merged model weights + # Try to load as an adapter and fall back to the checkpoint containing the full model + path, revision = resolve_loadable_path(config.model.load_from) + try: + peft_config = PeftConfig.from_pretrained(path, revision=revision) + peft_path = path + pretrained_model_path = peft_config.base_model_name_or_path + except ValueError as e: + print( + f"Unable to load model as adapter: {e}. " + "This is expected if the checkpoint does not contain adapter weights." 
) + peft_path = None + pretrained_model_path = path + + # Return the lm-harness version of a HuggingFace LLM + quantization_kwargs = config.quantization.dict() if config.quantization else {} + return HFLM( + pretrained=pretrained_model_path, + tokenizer=pretrained_model_path, + peft=peft_path, + revision=revision, + device="cuda" if config.ray.num_gpus > 0 else None, + trust_remote_code=config.model.trust_remote_code, + dtype=config.model.torch_dtype if config.model.torch_dtype else "auto", + **quantization_kwargs, + ) + + elif isinstance(config.model, InferenceServerConfig): + # Return the lm-harness version of a model endpoint + return OpenaiCompletionsLM( + model="vllm-model", + tokenizer=config.model.tokenizer, + base_url=config.model.base_url, + ) + + else: + raise ValueError(f"Unexpected model config type: {type(config.model)}") def load_and_evaluate(config: LMHarnessJobConfig) -> dict[str, Any]: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index fc1af0f4..b4ac2b07 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -25,7 +25,9 @@ def model_config_with_artifact(): @pytest.fixture def inference_server_config(): - return InferenceServerConfig(base_url="1.2.3.4:8000/v1/completions") + return InferenceServerConfig( + base_url="1.2.3.4:8000/v1/completions", tokenizer="mistralai/Mistral-7B-v0.1" + ) @pytest.fixture From 91caf14fe9c73a9705dc1e9d2b051cda64247111 Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 31 Jan 2024 14:05:44 -0500 Subject: [PATCH 16/16] precommit --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c2f14b3b..3199668d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,6 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: requirements-txt-fixer - exclude: requirements_lock.txt - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.1.7