Skip to content

Commit

Permalink
Work around offline bug in optimum
Browse files Browse the repository at this point in the history
Optimum raises if you try to use offline mode with hf hub.

huggingface/optimum#1796
  • Loading branch information
naddeoa committed Apr 7, 2024
1 parent 7095b14 commit 318aaec
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 6 deletions.
7 changes: 6 additions & 1 deletion Dockerfile.cache_test
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ WORKDIR /opt/whylogs-container
RUN chown -R whylabs:whylabs /opt/whylogs-container
USER whylabs

ENV LLM_CONTAINER=True
ENV CONTAINER_CACHE_BASE=/opt/whylogs-container/.cache
ENV HF_HOME=$CONTAINER_CACHE_BASE/hf_home/
ENV NLTK_DATA=$CONTAINER_CACHE_BASE/nltk_data/
Expand Down Expand Up @@ -38,9 +37,15 @@ RUN poetry config virtualenvs.in-project true
RUN poetry install --no-root --extras "all" --without dev
RUN rm -rf .venv/lib/python3.10/site-packages/pandas/tests # Pandas deploys a ton of tests to pypi

ENV TRANSFORMERS_VERBOSITY=debug

# Warm the model/tokenizer caches while network access is still available.
COPY ./langkit ./langkit
RUN bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache"
RUN find $CONTAINER_CACHE_BASE/

# This step will fail if any network requests happen
ENV TRANSFORMERS_OFFLINE=1
ENV HF_DATASETS_OFFLINE=1
ENV HF_HUB_OFFLINE=1
RUN --network=none bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache --skip-downloads"

31 changes: 26 additions & 5 deletions langkit/metrics/topic_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pyright: reportUnknownVariableType=none
# pyright: reportUnknownLambdaType=none

import os
from dataclasses import dataclass
from functools import lru_cache, partial
from typing import List, Optional, TypedDict
Expand All @@ -23,26 +24,46 @@

# Template the zero-shot classifier uses to turn each candidate topic label
# into a natural-language hypothesis for NLI-style scoring.
_hypothesis_template = "This example is about {}"

# Hugging Face model id and pinned commit revision. Pinning the revision keeps
# the cache path deterministic, which _get_model relies on to load the model
# from the local HF cache without any network access.
_model = "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33"
_revision = "dea69e79cd6063916d08b883ea8a3c1823fd10b4"


def _download_assets():
    """Download and cache the pinned ONNX zero-shot model and its tokenizer.

    Runs while the network is available so later loads (_get_model /
    _get_tokenizer) can succeed with local files only. Pinning `_revision`
    makes the resulting cache path deterministic.
    """
    ORTModelForSequenceClassification.from_pretrained(
        _model,
        subfolder="onnx",
        file_name="model.onnx",
        revision=_revision,
        export=False,
    )
    AutoTokenizer.from_pretrained(_model, revision=_revision)


def _get_tokenizer() -> PreTrainedTokenizerBase:
    """Load the pinned tokenizer strictly from the local HF cache (no network)."""
    return AutoTokenizer.from_pretrained(_model, revision=_revision, local_files_only=True)


def _get_model() -> PreTrainedModel:
    """Load the pinned ONNX zero-shot model from the local HF cache, offline.

    Optimum doesn't support offline mode (HF_HUB_OFFLINE raises), see
    https://github.com/huggingface/optimum/issues/1796. Workaround: resolve
    the snapshot directory inside the HF cache ourselves and load from that
    path directly. When the issue is fixed, this can revert to:
    #   return ORTModelForSequenceClassification.from_pretrained(
    #       _model,
    #       subfolder="onnx",
    #       file_name="model.onnx",
    #       export=False,
    #       revision=_revision,
    #       local_files_only=True,
    #   )
    """
    # HF cache directory naming scheme: "org/name" -> "models--org--name"
    model_name = _model.replace("/", "--")
    home_dir = os.path.expanduser("~")
    # Respect HF_HOME when set (the container sets it); fall back to the default cache.
    base = os.environ.get("HF_HOME", os.path.join(home_dir, ".cache/huggingface"))
    model_path = f"{base}/hub/models--{model_name}/snapshots/{_revision}"
    return ORTModelForSequenceClassification.from_pretrained(
        model_path,
        file_name="onnx/model.onnx",
        export=False,
        revision=_revision,
        local_files_only=True,
    )

Expand Down

0 comments on commit 318aaec

Please sign in to comment.