Main

Add support for tensor parallelism and add OLMo2-26B model config / train script #824

Workflow file for this run

	# Workflow display name shown in the GitHub Actions UI.
	name: Main

	# One active run per workflow + ref: a newer run for the same ref cancels
	# the run that is still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	# Triggers: pull requests targeting main, pushes to main, and pushes of
	# version tags.
	on:
	  pull_request:
	    branches:
	      - main
	  push:
	    branches:
	      - main
	    tags:
	      # Glob for tags of the form v<major>.<minor>.<patch>; the release job
	      # only runs for refs under refs/tags/ and strips the leading "v".
	      - 'v*.*.*'

	# Environment variables shared by every job in this workflow.
	env:
	  # Change this to invalidate existing cache.
	  CACHE_PREFIX: v0
	  PYTHONPATH: ./src/
	  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	  # The GPU job's Beaker steps are guarded by `env.BEAKER_TOKEN != ''`, so
	  # they are skipped whenever this resolves to an empty string.
	  BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}

	jobs:
	  # CPU checks. Each matrix task supplies a display name and a shell command;
	  # the command is run inside the project's virtual environment.
	  checks:
	    name: ${{ matrix.task.name }}
	    runs-on: [ubuntu-latest]
	    timeout-minutes: 8
	    strategy:
	      # Let every task report its own result instead of cancelling siblings.
	      fail-fast: false
	      matrix:
	        python: ['3.10']
	        task:
	          - name: Lint
	            run: make lint-check

	          - name: Test
	            run: |
	              pytest -v --color=yes --durations=3 src/test/ \
	                --ignore-glob='src/test/distributed/checkpoint*'

	          - name: Test checkpoint
	            run: |
	              pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

	          - name: Type check
	            run: make type-check

	          - name: Build
	            run: make build

	          - name: Style
	            run: make style-check

	          - name: Docs
	            run: |
	              cd docs && make html SPHINXOPTS="-W --keep-going"

	        # One extra combination: lint on an additional Python version.
	        include:
	          - python: '3.9'
	            task:
	              name: Lint (min Python)
	              run: make lint-check

	    steps:
	      - uses: actions/checkout@v4

	      - name: Setup Python environment
	        uses: ./.github/actions/setup-venv
	        with:
	          python-version: ${{ matrix.python }}
	          cache-prefix: ${{ env.CACHE_PREFIX }}

	      # The primary key ends in the commit SHA; the restore keys fall back to
	      # progressively less specific prefixes (same ref, then any ref).
	      - name: Restore mypy cache
	        if: matrix.task.name == 'Type check'
	        uses: actions/cache@v4
	        with:
	          path: .mypy_cache
	          key: mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}-${{ github.sha }}
	          restore-keys: |
	            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}
	            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}

	      # Runs the matrix task's command after activating the venv.
	      - name: ${{ matrix.task.name }}
	        run: |
	          . .venv/bin/activate
	          ${{ matrix.task.run }}

	      # Only the Build task produces dist/; the release job downloads this
	      # artifact by the same name.
	      - name: Upload package distribution files
	        if: matrix.task.name == 'Build'
	        uses: actions/upload-artifact@v4
	        with:
	          name: package
	          path: dist

	      # Runs even when an earlier step failed.
	      - name: Clean up
	        if: always()
	        run: |
	          . .venv/bin/activate
	          pip uninstall -y ai2-olmo-core

	  # GPU tests. Each matrix task names an image, a GPU count and a pytest
	  # command, which are substituted into the Beaker experiment spec below.
	  gpu_checks:
	    name: ${{ matrix.task.name }}
	    runs-on: ubuntu-latest
	    timeout-minutes: 15
	    strategy:
	      fail-fast: false
	      matrix:
	        task:
	          - name: Test (GPU)
	            image: olmo-core
	            gpus: 2
	            run: |
	              pytest -v --color=yes --durations=3 -m gpu \
	                --ignore-glob='src/test/distributed/checkpoint*' \
	                --ignore-glob='src/test/nn/moe*' \
	                src/test/

	          - name: Test checkpoint (GPU)
	            image: olmo-core
	            gpus: 2
	            run: |
	              pytest -v --color=yes --durations=3 -m gpu \
	                src/test/distributed/checkpoint*

	          - name: Test MoE (GPU)
	            image: olmo-core
	            gpus: 1
	            run: |
	              pytest -v --color=yes --durations=3 -m gpu \
	                src/test/nn/moe*

	    steps:
	      - uses: actions/checkout@v4

	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'

	      - name: Set env vars
	        run: |
	          echo "BEAKER_WORKSPACE=$(make get-beaker-workspace)" >> $GITHUB_ENV

	      # Exactly one of the next two steps runs, depending on the event type;
	      # both export COMMIT_SHA for the checkout inside the Beaker task.
	      - name: Determine current commit SHA (pull request)
	        if: github.event_name == 'pull_request'
	        run: |
	          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV

	      - name: Determine current commit SHA (push)
	        if: github.event_name != 'pull_request'
	        run: |
	          echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV

	      # Every remaining step is skipped when no Beaker token is available.
	      - uses: allenai/setup-beaker@v2
	        if: env.BEAKER_TOKEN != ''
	        with:
	          token: ${{ env.BEAKER_TOKEN }}
	          workspace: ${{ env.BEAKER_WORKSPACE }}

	      - name: Get full image name
	        if: env.BEAKER_TOKEN != ''
	        run: |
	          echo "BEAKER_IMAGE=$(make get-full-beaker-image-name IMAGE_NAME=${{ matrix.task.image }})" >> $GITHUB_ENV

	      # NOTE(review): in this copy the action reference had been replaced by
	      # an e-mail obfuscation placeholder ("[email protected]"). It is restored
	      # here as beaker-run-action@v1.2 -- confirm name and version against
	      # the repository before relying on it.
	      - name: GPU Tests
	        uses: allenai/beaker-run-action@v1.2
	        if: env.BEAKER_TOKEN != ''
	        with:
	          spec: |
	            version: v2
	            description: OLMo-core ${{ matrix.task.name }}
	            budget: ai2/oe-training
	            tasks:
	              - name: tests
	                image:
	                  beaker: ${{ env.BEAKER_IMAGE }}
	                context:
	                  priority: normal
	                  preemptible: true
	                resources:
	                  gpuCount: ${{ matrix.task.gpus }}
	                constraints:
	                  cluster:
	                    # H100 clusters
	                    - ai2/jupiter-cirrascale-2
	                    - ai2/pluto-cirrascale
	                    - ai2/augusta-google-1
	                    # A100 clusters
	                    - ai2/saturn-cirrascale
	                    - ai2/allennlp-cirrascale
	                    # - ai2/allennlp-elanding-a100-40g
	                envVars:
	                  - name: CUBLAS_WORKSPACE_CONFIG
	                    value: ":16:8"
	                  - name: TOKENIZERS_PARALLELISM
	                    value: "false"
	                  - name: AWS_ACCESS_KEY_ID
	                    secret: AWS_ACCESS_KEY_ID
	                  - name: AWS_SECRET_ACCESS_KEY
	                    secret: AWS_SECRET_ACCESS_KEY
	                command:
	                  - "bash"
	                  - "-c"
	                  - "git clone https://github.com/allenai/OLMo-core.git && cd OLMo-core && git checkout ${{ env.COMMIT_SHA }} && pip install -e .[all] && ${{ matrix.task.run }}"
	                result:
	                  path: /unused
	          token: ${{ env.BEAKER_TOKEN }}
	          workspace: ${{ env.BEAKER_WORKSPACE }}

	  # Publishes the built package to PyPI and creates a GitHub release.
	  # Runs only for tag refs, and only after the `checks` job has succeeded.
	  release:
	    name: Release
	    runs-on: ubuntu-latest
	    needs: [checks]
	    if: startsWith(github.ref, 'refs/tags/')
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0

	      - name: Setup Python environment
	        uses: ./.github/actions/setup-venv
	        with:
	          python-version: '3.10'
	          cache-prefix: ${{ env.CACHE_PREFIX }}

	      # RELEASE_VERSION drops the "refs/tags/v" prefix; TAG keeps the "v".
	      - name: Prepare environment
	        run: |
	          echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
	          echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV

	      # Fetches the dist/ artifact uploaded by the Build task of `checks`.
	      - name: Download package distribution files
	        uses: actions/download-artifact@v4
	        with:
	          name: package
	          path: dist

	      - name: Generate release notes
	        run: |
	          . .venv/bin/activate
	          python src/scripts/release_notes.py > ${{ github.workspace }}-RELEASE_NOTES.md

	      # Credentials are handed to twine through its environment variables so
	      # the token is never spliced into the shell script text.
	      - name: Publish package to PyPI
	        env:
	          TWINE_USERNAME: __token__
	          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
	        run: |
	          . .venv/bin/activate
	          twine upload dist/*

	      # Tags containing "rc" are published as pre-releases.
	      - name: Publish GitHub release
	        uses: softprops/action-gh-release@v2
	        env:
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        with:
	          body_path: ${{ github.workspace }}-RELEASE_NOTES.md
	          prerelease: ${{ contains(env.TAG, 'rc') }}
	          files: |
	            dist/*

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add support for tensor parallelism and add OLMo2-26B model config / train script #824

Workflow file

Add support for tensor parallelism and add OLMo2-26B model config / train script #824

Jobs

Run details

Workflow file for this run