diff --git a/Dockerfile.cache_test b/Dockerfile.cache_test index fd08b2c..86ece0e 100644 --- a/Dockerfile.cache_test +++ b/Dockerfile.cache_test @@ -4,7 +4,6 @@ WORKDIR /opt/whylogs-container RUN chown -R whylabs:whylabs /opt/whylogs-container USER whylabs -ENV LLM_CONTAINER=True ENV CONTAINER_CACHE_BASE=/opt/whylogs-container/.cache ENV HF_HOME=$CONTAINER_CACHE_BASE/hf_home/ ENV NLTK_DATA=$CONTAINER_CACHE_BASE/nltk_data/ @@ -38,9 +37,15 @@ RUN poetry config virtualenvs.in-project true RUN poetry install --no-root --extras "all" --without dev RUN rm -rf .venv/lib/python3.10/site-packages/pandas/tests # Pandas deploys a ton of tests to pypi +ENV TRANSFORMERS_VERBOSITY=debug copy ./langkit ./langkit RUN bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache" +RUN find $CONTAINER_CACHE_BASE/ + # This step will fail if any network requests happen +ENV TRANSFORMERS_OFFLINE=1 +ENV HF_DATASETS_OFFLINE=1 +ENV HF_HUB_OFFLINE=1 RUN --network=none bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache --skip-downloads" diff --git a/langkit/metrics/topic_onnx.py b/langkit/metrics/topic_onnx.py index 849f288..2511bc0 100644 --- a/langkit/metrics/topic_onnx.py +++ b/langkit/metrics/topic_onnx.py @@ -2,6 +2,7 @@ # pyright: reportUnknownVariableType=none # pyright: reportUnknownLambdaType=none +import os from dataclasses import dataclass from functools import lru_cache, partial from typing import List, Optional, TypedDict @@ -23,26 +24,46 @@ _hypothesis_template = "This example is about {}" +_model = "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33" +_revision = "dea69e79cd6063916d08b883ea8a3c1823fd10b4" + def _download_assets(): ORTModelForSequenceClassification.from_pretrained( - "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33", + _model, subfolder="onnx", file_name="model.onnx", + revision=_revision, export=False, ) + AutoTokenizer.from_pretrained(_model, revision=_revision) def _get_tokenizer() -> PreTrainedTokenizerBase: - return AutoTokenizer.from_pretrained("MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33") + return AutoTokenizer.from_pretrained(_model, revision=_revision, local_files_only=True) def _get_model() -> PreTrainedModel: + # return ORTModelForSequenceClassification.from_pretrained( + # _model, + # subfolder="onnx", + # file_name="model.onnx", + # export=False, + # revision=_revision, + # local_files_only=True, + # ) + # Optimum doesn't support offline mode https://github.com/huggingface/optimum/issues/1796 + # workaround for now is to reference the actual model path after caching it. Uncomment the above code when the issue is resolved + + model_name = _model.replace("/", "--") + home_dir = os.path.expanduser("~") + base = os.environ.get("HF_HOME", os.path.join(home_dir, ".cache/huggingface")) + model_path = f"{base}/hub/models--{model_name }/snapshots/{_revision}" return ORTModelForSequenceClassification.from_pretrained( - "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33", - subfolder="onnx", - file_name="model.onnx", + model_path, + file_name="onnx/model.onnx", export=False, + revision=_revision, local_files_only=True, )