# setup.cfg

[metadata]
# Distribution identity.
name = crfm-helm
version = 0.2.4
url = https://github.com/stanford-crfm/helm

# Maintainer contact.
author = Stanford CRFM
author_email = contact-crfm@stanford.edu

# Short summary plus the long description; the files listed after `file:`
# are read and concatenated in the order given, and rendered as Markdown.
description = Benchmark for language models
long_description = file: README.md, docs/tutorial.md
long_description_content_type = text/markdown
keywords = language models benchmarking

# Licensing and trove classifiers.
license = Apache License 2.0
classifiers =
    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: 3.8
    License :: OSI Approved :: Apache Software License

# Core setuptools options: supported interpreters, the src/ layout, and the
# dependencies that are installed for every user of the package.
[options]
python_requires = >=3.8,<3.11

# Packages are discovered automatically; the root package namespace lives in src/.
packages = find:
package_dir =
    =src
include_package_data = True
zip_safe = False

# Requirements installed unconditionally, grouped by the area that needs them.
install_requires =
    # Common
    cattrs~=22.2.0
    dacite~=1.6.0
    importlib-resources~=5.10.0
    Mako~=1.2.3
    numpy~=1.23.3
    pyhocon~=0.3.59
    retrying~=1.3.4
    spacy~=3.5.3
    tqdm~=4.64.1
    zstandard~=0.18.0
    # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152
    # Keep sqlitedict version at 1.7.0.
    sqlitedict~=1.7.0
    bottle~=0.12.23
    # TODO: Remove these from common
    pymongo~=4.2.0

    # Basic Scenarios
    datasets~=2.5.2
    pyarrow>=11.0.0  # Pinned transitive dependency for datasets; workaround for #1026

    # Basic metrics
    nltk~=3.7
    pyext~=0.7
    rouge-score~=0.1.2
    scipy~=1.10.0
    uncertainty-calibration~=0.1.4
    scikit-learn~=1.1.2

    # Models and Metrics Extras
    transformers~=4.33.1  # For anthropic_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
    # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
    torch>=1.12.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
    torchvision>=0.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)

    # Metrics Extras
    google-api-python-client~=2.64.0  # For perspective_api_client via toxicity_metrics

# Optional dependency groups, selected with `pip install crfm-helm[<extra>]`.
# Entries of the form `crfm-helm[<extra>]` pull in another group defined in
# this section, so every name used that way must match a key below.
[options.extras_require]
# Feature extras
proxy-server =
    gunicorn~=20.1.0

human-evaluation =
    scaleapi~=2.13.0
    surge-api~=1.1.0

scenarios =
    gdown~=4.4.0  # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded()
    sympy~=1.11.1  # For numeracy_scenario
    xlrd~=2.0.1  # For ice_scenario: used by pandas.read_excel()

metrics =
    numba~=0.56.4  # For copyright_metrics
    pytrec_eval==0.5  # For ranking_metrics
    sacrebleu~=2.2.1  # For disinformation_metrics, machine_translation_metrics
    summ-eval~=0.892  # For summarization_metrics

plots =
    colorcet~=3.0.1
    matplotlib~=3.6.0
    seaborn~=0.11.0

slurm =
    simple-slurm~=0.2.6

# Model extras
aleph-alpha =
    aleph-alpha-client~=2.14.0
    tokenizers~=0.13.3

anthropic =
    anthropic~=0.2.5
    websocket-client~=1.3.2  # For legacy stanford-online-all-v4-s3

openai =
    openai~=0.27.8
    tiktoken~=0.3.3

tsinghua =
    icetk~=0.0.4

yandex =
    sentencepiece~=0.1.97

# Aggregate of every per-provider model extra defined above.
models =
    crfm-helm[aleph-alpha]
    crfm-helm[anthropic]
    crfm-helm[openai]
    crfm-helm[tsinghua]
    crfm-helm[yandex]

cleva =
    unidecode==1.3.6
    pypinyin==0.49.0
    jieba==0.42.1
    opencc==1.1.6
    langdetect==1.0.9

images =
    accelerate~=0.23.0  # For the newer versions of Transformers
    pillow~=9.4.0

# Install everything
# Every extra in this section except dev. The server extra is named
# proxy-server; no extra called `server` is defined in this section.
all =
    crfm-helm[proxy-server]
    crfm-helm[human-evaluation]
    crfm-helm[scenarios]
    crfm-helm[models]
    crfm-helm[metrics]
    crfm-helm[plots]
    crfm-helm[slurm]
    crfm-helm[cleva]
    crfm-helm[images]

# Development only
# Do not include in all
dev =
    pytest~=7.2.0
    black~=22.10.0
    mypy~=0.982
    pre-commit~=2.20.0
    flake8~=5.0.4

# Command-line programs generated at install time.
# Each entry has the form `<command> = <module path>:<function>`.
[options.entry_points]
console_scripts =
    # Commands backed by helm.benchmark
    helm-run = helm.benchmark.run:main
    helm-summarize = helm.benchmark.presentation.summarize:main
    helm-server = helm.benchmark.server:main
    helm-create-plots = helm.benchmark.presentation.create_plots:main
    # Commands backed by helm.proxy
    crfm-proxy-server = helm.proxy.server:main
    crfm-proxy-cli = helm.proxy.cli:main

# Arguments for the `packages = find:` discovery declared in [options].
[options.packages.find]
# Directory that is searched for packages.
where = src
# Package name patterns left out of the distribution.
exclude =
    tests*

# Settings for Flake8: Tool For Style Guide Enforcement
[flake8]
# Maximum permitted line length, in characters.
max-line-length = 120
# Paths skipped by the linter (presumably a local virtual environment; confirm).
exclude = venv/*

# Ignore completely:
# E203 - White space before ':', (conflicts with black)
# E231 - Missing whitespace after ',', ';', or ':'
# E731 - do not assign a lambda expression, use a def
# W503 - line break before binary operator, (conflicts with black)
# W605 - invalid escape sequence '\', (causes failures)
ignore = E203,E231,E731,W503,W605

# Settings for Mypy: static type checker for Python 3
[mypy]
# Do not report errors for imports that cannot be resolved.
ignore_missing_imports = True
# Type-check the bodies of functions even when they carry no annotations.
check_untyped_defs = True
# TODO: Remove disable_error_code
disable_error_code = annotation-unchecked
# TODO: Change disallow_untyped_defs to True
# Until then, function definitions without type annotations are accepted.
disallow_untyped_defs = False

# Settings for pytest
[tool:pytest]
# Extra command-line options applied to every pytest invocation.
addopts =
    # By default, we don't test models because doing so will
    # make real requests and spend real money
    -m 'not models'
# Custom markers registered for this test suite.
markers =
    # Marker for tests that make real model requests
    models