feat: hugging face to couchbase migration initial code (#2)
* initial code

* token fix

* workflow update
shyam-cb authored Nov 6, 2024
1 parent 1d3aca2 commit 34717b4
Showing 8 changed files with 843 additions and 2 deletions.
166 changes: 166 additions & 0 deletions .github/workflows/release.yaml
@@ -0,0 +1,166 @@
name: Build and Release

on:
  push:
    tags:
      - 'v*.*.*' # Triggers on version tags like v0.2.0
  workflow_dispatch: # Allows manual triggering

jobs:
  build:
    strategy:
      matrix:
        include:
          - os: windows-latest
            os_name: windows
            arch: x86_64
            arch_name: amd64
            extension: zip
          - os: macos-latest
            os_name: darwin
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: macos-12-arm64
            os_name: darwin
            arch: arm64
            arch_name: arm64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: arm64
            arch_name: arm64
            extension: tar.gz
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Step to extract the version number
      - name: Set Version
        id: get_version
        shell: bash
        run: |
          VERSION="${GITHUB_REF#refs/tags/}"
          VERSION="${VERSION#v}"
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "VERSION=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
          architecture: ${{ matrix.arch }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pyinstaller

      - name: Build with PyInstaller in directory mode
        env:
          ARCHFLAGS: ${{ runner.os == 'macOS' && matrix.arch == 'arm64' && '-arch arm64' || '' }}
        run: |
          pyinstaller your_script.py --name hf_to_cb_dataset_migrator

      # Code-signing and notarization steps for macOS
      - name: Code-sign on macOS
        if: runner.os == 'macOS'
        env:
          CERTIFICATE: ${{ secrets.APPLE_DEV_CERT }}
          CERT_PASSWORD: ${{ secrets.APPLE_DEV_CERT_PASSPHRASE }}
          KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
        run: |
          echo "$CERTIFICATE" | base64 --decode > certificate.p12
          security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          security import certificate.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign
          security set-keychain-settings -lut 21600 build.keychain
          security list-keychains -s build.keychain
          security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          # Sign the main executable
          codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator
          # Sign all dynamic libraries and executables
          find dist/hf_to_cb_dataset_migrator -type f \( -name "*.so" -or -name "*.dylib" -or -perm -u=x \) -exec codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" {} \;
          # Verify the code-signing
          codesign --verify --deep --strict --verbose=2 dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator

      # Compression and notarization for macOS
      - name: Compress Application Directory on macOS
        if: runner.os == 'macOS'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          cd dist
          zip -r "../$APP_NAME.zip" hf_to_cb_dataset_migrator

      - name: Notarize the macOS binary
        if: runner.os == 'macOS'
        env:
          APPLE_ID: ${{ secrets.APPLE_ID }}
          APPLE_APP_PASSWORD: ${{ secrets.APPLE_APP_PASSWORD }}
          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          xcrun notarytool submit "$APP_NAME.zip" --apple-id "$APPLE_ID" --password "$APPLE_APP_PASSWORD" --team-id "$APPLE_TEAM_ID" --wait
          # Staple the notarization ticket
          xcrun stapler staple "$APP_NAME.zip"

      # Compression for Linux
      - name: Compress Application Directory on Linux
        if: runner.os == 'Linux'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          tar -czvf "$APP_NAME.${{ matrix.extension }}" -C dist hf_to_cb_dataset_migrator
      # Compression for Windows
      - name: Compress Application Directory on Windows
        if: runner.os == 'Windows'
        shell: powershell
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          # Braces around Env:VERSION keep the trailing underscore out of the variable name.
          $APP_NAME = "hf_to_cb_dataset_migrator_${Env:VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          Compress-Archive -Path dist\hf_to_cb_dataset_migrator\* -DestinationPath "$APP_NAME.${{ matrix.extension }}"
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.os_name }}_${{ matrix.arch_name }}
          path: |
            hf_to_cb_dataset_migrator_*_${{ matrix.os_name }}_${{ matrix.arch_name }}.*

      - name: Create Release
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        id: create_release
        uses: actions/create-release@v1
        with:
          tag_name: ${{ github.ref }}
          release_name: Release ${{ env.VERSION }}
          draft: false
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Release Asset
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: |
            hf_to_cb_dataset_migrator_*_${{ matrix.os_name }}_${{ matrix.arch_name }}.*
          asset_name: hf_to_cb_dataset_migrator_${{ env.VERSION }}_${{ matrix.os_name }}_${{ matrix.arch_name }}.${{ matrix.extension }}
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
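
For reference, this workflow runs only on tags matching `v*.*.*` (or a manual `workflow_dispatch`), so a release would typically be cut as sketched below; the tag value is only an example.

```bash
# Cut a release by pushing a version tag that matches the 'v*.*.*' trigger.
# The tag value here is an example, not a required version.
git tag v0.2.0
git push origin v0.2.0
```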
11 changes: 9 additions & 2 deletions README.md
@@ -1,2 +1,9 @@
# hf-to-cb-dataset-migrator
Hugging face to Couchbase dataset migrator
# Hugging Face Dataset to Couchbase Migrator

A CLI tool to interact with Hugging Face datasets and migrate them to Couchbase, with support for streaming data.

## Installation

```bash
pip install -r requirements.txt
python setup.py install
```
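
A hedged sketch of a first run after installation: the `python -m` form below assumes the package from this commit is importable on the path and that its CLI group in `hf_to_cb_dataset_migrator/cli.py` is reachable that way; the console entry-point name configured by `setup.py` is not shown in this diff.

```bash
# Illustrative only: inspect the available subcommands and their options.
python -m hf_to_cb_dataset_migrator.cli --help
python -m hf_to_cb_dataset_migrator.cli list-configs --help
python -m hf_to_cb_dataset_migrator.cli migrate --help
```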
161 changes: 161 additions & 0 deletions hf_to_cb_dataset_migrator/cli.py
@@ -0,0 +1,161 @@
# hf_to_cb_dataset_migrator/cli.py

import click
import json
from migration import DatasetMigrator
from typing import Any

@click.group()
def main():
    """CLI tool to interact with Hugging Face datasets and migrate them to Couchbase."""
    pass

@main.command('list-configs')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--dynamic-modules-path', default=None, help='Path to dynamic modules (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--token', default=None, help='Use authentication token for private datasets.')
@click.option('--json-output', is_flag=True, help='Output the configurations in JSON format.')
def list_configs_cmd(path, revision, download_config, download_mode, dynamic_modules_path,
                     data_files, token, json_output):
    """List all configuration names for a given dataset."""
    migrator = DatasetMigrator(token=token)
    download_kwargs = {
        'revision': revision,
        # json.loads: --download-config is passed as a JSON string, not a file object.
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'dynamic_modules_path': dynamic_modules_path,
        'data_files': data_files if data_files else None,
    }
    # Remove None values
    download_kwargs = {k: v for k, v in download_kwargs.items() if v is not None}

    configs = migrator.list_configs(path, **download_kwargs)
    if configs:
        if json_output:
            click.echo(json.dumps(configs, indent=2))
        else:
            click.echo(f"Available configurations for '{path}':")
            for config in configs:
                click.echo(f"- {config}")
    else:
        click.echo(f"No configurations found for dataset '{path}' or dataset not found.")

@main.command('list-splits')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', 'config_name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--json-output', is_flag=True, help='Output the splits in JSON format.')
def list_splits_cmd(path, config_name, data_files, download_config, download_mode, revision, token,
                    json_output):
    """List all available splits for a given dataset and configuration."""
    migrator = DatasetMigrator(token=token)

    config_kwargs = {
        'data_files': data_files if data_files else None,
        # json.loads: --download-config is a JSON string from the command line.
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'revision': revision,
    }
    # Remove None values
    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}

    splits = migrator.list_splits(path, config_name=config_name, **config_kwargs)
    if splits:
        if json_output:
            click.echo(json.dumps(splits, indent=2))
        else:
            config_name_display = config_name if config_name else "default"
            click.echo(f"Available splits for dataset '{path}' with config '{config_name_display}':")
            for split in splits:
                click.echo(f"- {split}")
    else:
        click.echo(f"No splits found for dataset '{path}' with config '{config_name}' or dataset not found.")

@main.command()
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-dir', default=None, help='Directory with the data files (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--split', default=None, help='Which split of the data to load (optional).')
@click.option('--cache-dir', default=None, help='Cache directory to store the datasets (optional).')
#@click.option('--features', default=None, help='Set of features to use (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--verification-mode', type=click.Choice(['no_checks', 'basic_checks', 'all_checks']), default=None,
              help='Verification mode (optional).')
@click.option('--keep-in-memory', is_flag=True, default=False, help='Keep the dataset in memory (optional).')
@click.option('--save-infos', is_flag=True, default=False, help='Save dataset information (default: False).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--streaming/--no-streaming', default=True, help='Load the dataset in streaming mode (default: True).')
@click.option('--num-proc', default=None, type=int, help='Number of processes to use (optional).')
@click.option('--storage-options', default=None, help='Storage options for remote filesystems (optional).')
@click.option('--trust-remote-code', is_flag=True, default=None,
              help='Allow loading arbitrary code from the dataset repository (optional).')
@click.option('--id-fields', default=None, help='Comma-separated list of field names to use as document ID.')
@click.option('--cb-url', prompt='Couchbase URL', help='Couchbase cluster URL (e.g., couchbase://localhost).')
@click.option('--cb-username', prompt='Couchbase username', help='Username for Couchbase authentication.')
@click.option('--cb-password', prompt=True, hide_input=True, confirmation_prompt=False,
              help='Password for Couchbase authentication.')
@click.option('--cb-bucket', prompt='Couchbase bucket name', help='Couchbase bucket to store data.')
@click.option('--cb-scope', default=None, help='Couchbase scope name (optional).')
@click.option('--cb-collection', default=None, help='Couchbase collection name (optional).')
def migrate(
    path, name, data_dir, data_files, split, cache_dir,
    #features,
    download_config, download_mode,
    verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options,
    trust_remote_code, id_fields, cb_url, cb_username, cb_password, cb_bucket, cb_scope, cb_collection
):
    """Migrate datasets from Hugging Face to Couchbase."""
    click.echo(f"Starting migration of dataset '{path}' to Couchbase bucket '{cb_bucket}'...")
    migrator = DatasetMigrator(token=token)

    # Prepare data_files
    data_files = list(data_files) if data_files else None

    result = migrator.migrate_dataset(
        path=path,
        cb_url=cb_url,
        cb_username=cb_username,
        cb_password=cb_password,
        couchbase_bucket=cb_bucket,
        cb_scope=cb_scope,
        cb_collection=cb_collection,
        id_fields=id_fields,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        split=split,
        cache_dir=cache_dir,
        #features=features,
        download_config=download_config,
        download_mode=download_mode,
        verification_mode=verification_mode,
        keep_in_memory=keep_in_memory,
        save_infos=save_infos,
        revision=revision,
        token=token,
        streaming=streaming,
        num_proc=num_proc,
        storage_options=json.loads(storage_options) if storage_options else None,
        trust_remote_code=trust_remote_code,
    )
    if result:
        click.echo("Migration completed successfully.")
    else:
        click.echo("Migration failed.")

if __name__ == '__main__':
    main()
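
To make the command surface above concrete, here is a hedged usage sketch: the dataset name, Couchbase URL, credentials, and bucket/scope/collection names are all placeholders, and the `python -m` invocation is an assumption since the installed entry-point name is not part of this diff. The password is omitted so Click prompts for it interactively.

```bash
# List configurations and splits for a dataset (dataset name is an example).
python -m hf_to_cb_dataset_migrator.cli list-configs --path imdb --json-output
python -m hf_to_cb_dataset_migrator.cli list-splits --path imdb --json-output

# Migrate a split into Couchbase; connection details below are placeholders.
# --cb-password is omitted so the CLI prompts for it (hide_input=True).
python -m hf_to_cb_dataset_migrator.cli migrate \
  --path imdb \
  --split train \
  --streaming \
  --cb-url couchbase://localhost \
  --cb-username Administrator \
  --cb-bucket hf_datasets \
  --cb-scope my_scope \
  --cb-collection my_collection
```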