Commit: feat: hugging face to couchbase migration initial code (#2)

* initial code
* token fix
* workflow update

Showing 8 changed files with 843 additions and 2 deletions.
@@ -0,0 +1,166 @@

```yaml
name: Build and Release

on:
  push:
    tags:
      - 'v*.*.*'  # Triggers on version tags like v0.2.0
  workflow_dispatch:  # Allows manual triggering

jobs:
  build:
    strategy:
      matrix:
        include:
          - os: windows-latest
            os_name: windows
            arch: x86_64
            arch_name: amd64
            extension: zip
          - os: macos-latest
            os_name: darwin
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: macos-12-arm64
            os_name: darwin
            arch: arm64
            arch_name: arm64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: arm64
            arch_name: arm64
            extension: tar.gz
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Step to extract the version number
      - name: Set Version
        id: get_version
        shell: bash
        run: |
          VERSION="${GITHUB_REF#refs/tags/}"
          VERSION="${VERSION#v}"
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "VERSION=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
          architecture: ${{ matrix.arch }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pyinstaller

      - name: Build with PyInstaller in directory mode
        env:
          ARCHFLAGS: ${{ runner.os == 'macOS' && matrix.arch == 'arm64' && '-arch arm64' || '' }}
        run: |
          pyinstaller your_script.py --name hf_to_cb_dataset_migrator

      # Code-signing and notarization steps for macOS
      - name: Code-sign on macOS
        if: runner.os == 'macOS'
        env:
          CERTIFICATE: ${{ secrets.APPLE_DEV_CERT }}
          CERT_PASSWORD: ${{ secrets.APPLE_DEV_CERT_PASSPHRASE }}
          KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
        run: |
          echo "$CERTIFICATE" | base64 --decode > certificate.p12
          security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          security import certificate.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign
          security set-keychain-settings -lut 21600 build.keychain
          security list-keychains -s build.keychain
          security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          # Sign the main executable
          codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator
          # Sign all dynamic libraries and executables
          find dist/hf_to_cb_dataset_migrator -type f \( -name "*.so" -or -name "*.dylib" -or -perm -u=x \) -exec codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" {} \;
          # Verify the code-signing
          codesign --verify --deep --strict --verbose=2 dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator

      # Compression and notarization for macOS
      - name: Compress Application Directory on macOS
        if: runner.os == 'macOS'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          cd dist
          zip -r "../$APP_NAME.zip" hf_to_cb_dataset_migrator

      - name: Notarize the macOS binary
        if: runner.os == 'macOS'
        env:
          APPLE_ID: ${{ secrets.APPLE_ID }}
          APPLE_APP_PASSWORD: ${{ secrets.APPLE_APP_PASSWORD }}
          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          xcrun notarytool submit "$APP_NAME.zip" --apple-id "$APPLE_ID" --password "$APPLE_APP_PASSWORD" --team-id "$APPLE_TEAM_ID" --wait
          # Staple the notarization ticket
          xcrun stapler staple "$APP_NAME.zip"

      # Compression for Linux
      - name: Compress Application Directory on Linux
        if: runner.os == 'Linux'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          tar -czvf "$APP_NAME.${{ matrix.extension }}" -C dist hf_to_cb_dataset_migrator

      # Compression for Windows
      - name: Compress Application Directory on Windows
        if: runner.os == 'Windows'
        shell: powershell
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          $APP_NAME = "hf_to_cb_dataset_migrator_${Env:VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          Compress-Archive -Path dist\hf_to_cb_dataset_migrator\* -DestinationPath "$APP_NAME.${{ matrix.extension }}"

      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.os_name }}_${{ matrix.arch_name }}
          path: |
            hf_to_cb_dataset_migrator_*_${{ matrix.os_name }}_${{ matrix.arch_name }}.*

      - name: Create Release
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        id: create_release
        uses: actions/create-release@v1
        with:
          tag_name: ${{ github.ref }}
          release_name: Release ${{ env.VERSION }}
          draft: false
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Release Asset
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: hf_to_cb_dataset_migrator_${{ env.VERSION }}_${{ matrix.os_name }}_${{ matrix.arch_name }}.${{ matrix.extension }}
          asset_name: hf_to_cb_dataset_migrator_${{ env.VERSION }}_${{ matrix.os_name }}_${{ matrix.arch_name }}.${{ matrix.extension }}
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
```
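The Set Version step relies on Bash parameter expansion to turn the pushed tag ref into a bare version string. A minimal sketch of what it computes, using a hypothetical tag `v0.2.0`:

```bash
# Illustration of the "Set Version" step above (hypothetical tag v0.2.0).
GITHUB_REF="refs/tags/v0.2.0"       # the ref GitHub Actions sets on a tag push
VERSION="${GITHUB_REF#refs/tags/}"  # strip the "refs/tags/" prefix -> "v0.2.0"
VERSION="${VERSION#v}"              # strip the leading "v"         -> "0.2.0"
echo "$VERSION"                     # prints: 0.2.0
```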
@@ -1,2 +1,9 @@

````diff
-# hf-to-cb-dataset-migrator
-Hugging face to Couchbase dataset migrator
+# Hugging Face Dataset to Couchbase Migrator
+
+A CLI tool to interact with Hugging Face datasets and migrate them to Couchbase, with support for streaming data.
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+python setup.py install
````
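Once installed, the commands defined in `my_cli/cli.py` below can be invoked through the Click group. A hypothetical session (the `python -m my_cli.cli` invocation and the bucket name `hf_data` are assumptions; this commit does not show `setup.py`, so the installed console-script name is unknown):

```bash
# Hypothetical usage; flag names come from my_cli/cli.py in this commit,
# but the entry-point invocation itself is an assumption.
python -m my_cli.cli list-configs --path imdb --json-output
python -m my_cli.cli list-splits --path imdb --json-output
python -m my_cli.cli migrate \
  --path imdb \
  --split train \
  --id-fields id \
  --cb-url couchbase://localhost \
  --cb-username Administrator \
  --cb-bucket hf_data   # assumes an 'id' column; prompts for the Couchbase password
```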
@@ -0,0 +1,161 @@

```python
# my_cli/cli.py

import click
import json
from migration import DatasetMigrator


@click.group()
def main():
    """CLI tool to interact with Hugging Face datasets and migrate them to Couchbase."""
    pass


@main.command('list-configs')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters as JSON (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--dynamic-modules-path', default=None, help='Path to dynamic modules (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--json-output', is_flag=True, help='Output the configurations in JSON format.')
def list_configs_cmd(path, revision, download_config, download_mode, dynamic_modules_path,
                     data_files, token, json_output):
    """List all configuration names for a given dataset."""
    migrator = DatasetMigrator(token=token)
    download_kwargs = {
        'revision': revision,
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'dynamic_modules_path': dynamic_modules_path,
        'data_files': data_files if data_files else None,
    }
    # Remove None values
    download_kwargs = {k: v for k, v in download_kwargs.items() if v is not None}

    configs = migrator.list_configs(path, **download_kwargs)
    if configs:
        if json_output:
            click.echo(json.dumps(configs, indent=2))
        else:
            click.echo(f"Available configurations for '{path}':")
            for config in configs:
                click.echo(f"- {config}")
    else:
        click.echo(f"No configurations found for dataset '{path}' or dataset not found.")


@main.command('list-splits')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', 'config_name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters as JSON (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--json-output', is_flag=True, help='Output the splits in JSON format.')
def list_splits_cmd(path, config_name, data_files, download_config, download_mode, revision, token,
                    json_output):
    """List all available splits for a given dataset and configuration."""
    migrator = DatasetMigrator(token=token)

    config_kwargs = {
        'data_files': data_files if data_files else None,
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'revision': revision,
    }
    # Remove None values
    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}

    splits = migrator.list_splits(path, config_name=config_name, **config_kwargs)
    if splits:
        if json_output:
            click.echo(json.dumps(splits, indent=2))
        else:
            config_name_display = config_name if config_name else "default"
            click.echo(f"Available splits for dataset '{path}' with config '{config_name_display}':")
            for split in splits:
                click.echo(f"- {split}")
    else:
        click.echo(f"No splits found for dataset '{path}' with config '{config_name}' or dataset not found.")


@main.command()
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-dir', default=None, help='Directory with the data files (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--split', default=None, help='Which split of the data to load (optional).')
@click.option('--cache-dir', default=None, help='Cache directory to store the datasets (optional).')
# @click.option('--features', default=None, help='Set of features to use (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--verification-mode', type=click.Choice(['no_checks', 'basic_checks', 'all_checks']), default=None,
              help='Verification mode (optional).')
@click.option('--keep-in-memory', is_flag=True, default=False, help='Keep the dataset in memory (optional).')
@click.option('--save-infos', is_flag=True, default=False, help='Save dataset information (default: False).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--streaming/--no-streaming', default=True, help='Load the dataset in streaming mode (default: True).')
@click.option('--num-proc', default=None, type=int, help='Number of processes to use (optional).')
@click.option('--storage-options', default=None, help='Storage options for remote filesystems, as JSON (optional).')
@click.option('--trust-remote-code', is_flag=True, default=None,
              help='Allow loading arbitrary code from the dataset repository (optional).')
@click.option('--id-fields', default=None, help='Comma-separated list of field names to use as document ID.')
@click.option('--cb-url', prompt='Couchbase URL', help='Couchbase cluster URL (e.g., couchbase://localhost).')
@click.option('--cb-username', prompt='Couchbase username', help='Username for Couchbase authentication.')
@click.option('--cb-password', prompt=True, hide_input=True, confirmation_prompt=False,
              help='Password for Couchbase authentication.')
@click.option('--cb-bucket', prompt='Couchbase bucket name', help='Couchbase bucket to store data.')
@click.option('--cb-scope', default=None, help='Couchbase scope name (optional).')
@click.option('--cb-collection', default=None, help='Couchbase collection name (optional).')
def migrate(
    path, name, data_dir, data_files, split, cache_dir,
    # features,
    download_config, download_mode,
    verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options,
    trust_remote_code, id_fields, cb_url, cb_username, cb_password, cb_bucket, cb_scope, cb_collection
):
    """Migrate datasets from Hugging Face to Couchbase."""
    click.echo(f"Starting migration of dataset '{path}' to Couchbase bucket '{cb_bucket}'...")
    migrator = DatasetMigrator(token=token)

    # Prepare data_files
    data_files = list(data_files) if data_files else None

    result = migrator.migrate_dataset(
        path=path,
        cb_url=cb_url,
        cb_username=cb_username,
        cb_password=cb_password,
        couchbase_bucket=cb_bucket,
        cb_scope=cb_scope,
        cb_collection=cb_collection,
        id_fields=id_fields,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        split=split,
        cache_dir=cache_dir,
        # features=features,
        download_config=download_config,
        download_mode=download_mode,
        verification_mode=verification_mode,
        keep_in_memory=keep_in_memory,
        save_infos=save_infos,
        revision=revision,
        token=token,
        streaming=streaming,
        num_proc=num_proc,
        storage_options=json.loads(storage_options) if storage_options else None,
        trust_remote_code=trust_remote_code,
    )
    if result:
        click.echo("Migration completed successfully.")
    else:
        click.echo("Migration failed.")


if __name__ == '__main__':
    main()
```
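`cli.py` imports `DatasetMigrator` from `migration`, which is among the 8 changed files but is not rendered on this page. A minimal sketch of the interface the CLI expects, assuming the list methods delegate to the `datasets` library; only the call signatures are taken from the code above, and the bodies are hypothetical:

```python
# Hypothetical sketch of migration.DatasetMigrator as used by cli.py above.
# Signatures mirror the calls in this commit; implementations are assumptions.
from typing import Any, List, Optional

from datasets import get_dataset_config_names, get_dataset_split_names


class DatasetMigrator:
    def __init__(self, token: Optional[str] = None):
        self.token = token  # Hugging Face token for private datasets

    def list_configs(self, path: str, **download_kwargs: Any) -> List[str]:
        # e.g. ["plain_text"] for many single-config datasets
        return get_dataset_config_names(path, **download_kwargs)

    def list_splits(self, path: str, config_name: Optional[str] = None,
                    **config_kwargs: Any) -> List[str]:
        # e.g. ["train", "test", "unsupervised"]
        return get_dataset_split_names(path, config_name=config_name, **config_kwargs)

    def migrate_dataset(self, path: str, cb_url: str, cb_username: str,
                        cb_password: str, couchbase_bucket: str,
                        cb_scope: Optional[str] = None,
                        cb_collection: Optional[str] = None,
                        id_fields: Optional[str] = None,
                        **load_kwargs: Any) -> bool:
        # Expected to load the dataset (streaming by default) and upsert each
        # record into the given Couchbase bucket/scope/collection, deriving
        # document IDs from the comma-separated id_fields.
        raise NotImplementedError  # real implementation lives in migration.py
```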