feat: hugging face to couchbase migration initial code (#2)
* initial code

* token fix

* workflow update
shyam-cb authored Nov 6, 2024
1 parent 1d3aca2 commit 34717b4
Showing 8 changed files with 843 additions and 2 deletions.
166 changes: 166 additions & 0 deletions .github/workflows/release.yaml
@@ -0,0 +1,166 @@
name: Build and Release

on:
  push:
    tags:
      - 'v*.*.*' # Triggers on version tags like v0.2.0
  workflow_dispatch: # Allows manual triggering

jobs:
  build:
    strategy:
      matrix:
        include:
          - os: windows-latest
            os_name: windows
            arch: x86_64
            arch_name: amd64
            extension: zip
          - os: macos-latest
            os_name: darwin
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: macos-12-arm64
            os_name: darwin
            arch: arm64
            arch_name: arm64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: x86_64
            arch_name: amd64
            extension: tar.gz
          - os: ubuntu-latest
            os_name: linux
            arch: arm64
            arch_name: arm64
            extension: tar.gz
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Step to extract the version number
      - name: Set Version
        id: get_version
        shell: bash
        run: |
          VERSION="${GITHUB_REF#refs/tags/}"
          VERSION="${VERSION#v}"
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "VERSION=$VERSION" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
          architecture: ${{ matrix.arch }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pyinstaller

      - name: Build with PyInstaller in directory mode
        env:
          ARCHFLAGS: ${{ runner.os == 'macOS' && matrix.arch == 'arm64' && '-arch arm64' || '' }}
        run: |
          pyinstaller your_script.py --name hf_to_cb_dataset_migrator

      # Code-signing and notarization steps for macOS
      - name: Code-sign on macOS
        if: runner.os == 'macOS'
        env:
          CERTIFICATE: ${{ secrets.APPLE_DEV_CERT }}
          CERT_PASSWORD: ${{ secrets.APPLE_DEV_CERT_PASSPHRASE }}
          KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
        run: |
          echo "$CERTIFICATE" | base64 --decode > certificate.p12
          security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          security import certificate.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign
          security set-keychain-settings -lut 21600 build.keychain
          security list-keychains -s build.keychain
          security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
          # Sign the main executable
          codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator
          # Sign all dynamic libraries and executables
          find dist/hf_to_cb_dataset_migrator -type f \( -name "*.so" -or -name "*.dylib" -or -perm -u=x \) -exec codesign --force --options runtime --sign "Developer ID Application: Your Name (Team ID)" {} \;
          # Verify the code-signing
          codesign --verify --deep --strict --verbose=2 dist/hf_to_cb_dataset_migrator/hf_to_cb_dataset_migrator

      # Compression and notarization for macOS
      - name: Compress Application Directory on macOS
        if: runner.os == 'macOS'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          cd dist
          zip -r "../$APP_NAME.zip" hf_to_cb_dataset_migrator

      - name: Notarize the macOS binary
        if: runner.os == 'macOS'
        env:
          APPLE_ID: ${{ secrets.APPLE_ID }}
          APPLE_APP_PASSWORD: ${{ secrets.APPLE_APP_PASSWORD }}
          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          xcrun notarytool submit "$APP_NAME.zip" --apple-id "$APPLE_ID" --password "$APPLE_APP_PASSWORD" --team-id "$APPLE_TEAM_ID" --wait
          # Staple the notarization ticket
          xcrun stapler staple "$APP_NAME.zip"

      # Compression for Linux
      - name: Compress Application Directory on Linux
        if: runner.os == 'Linux'
        shell: bash
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          APP_NAME="hf_to_cb_dataset_migrator_${VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          tar -czvf "$APP_NAME.${{ matrix.extension }}" -C dist hf_to_cb_dataset_migrator
      # Compression for Windows
      - name: Compress Application Directory on Windows
        if: runner.os == 'Windows'
        shell: powershell
        env:
          VERSION: ${{ env.VERSION }}
        run: |
          # Braces around Env:VERSION keep the trailing underscore out of the variable name.
          $APP_NAME = "hf_to_cb_dataset_migrator_${Env:VERSION}_${{ matrix.os_name }}_${{ matrix.arch_name }}"
          Compress-Archive -Path dist\hf_to_cb_dataset_migrator\* -DestinationPath "$APP_NAME.${{ matrix.extension }}"
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.os_name }}_${{ matrix.arch_name }}
          path: |
            hf_to_cb_dataset_migrator_*_${{ matrix.os_name }}_${{ matrix.arch_name }}.*

      - name: Create Release
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        id: create_release
        uses: actions/create-release@v1
        with:
          tag_name: ${{ github.ref }}
          release_name: Release ${{ env.VERSION }}
          draft: false
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Release Asset
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: |
            hf_to_cb_dataset_migrator_*_${{ matrix.os_name }}_${{ matrix.arch_name }}.*
          asset_name: hf_to_cb_dataset_migrator_${{ env.VERSION }}_${{ matrix.os_name }}_${{ matrix.arch_name }}.${{ matrix.extension }}
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
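
For reference, this workflow runs only on tags matching `v*.*.*` (or a manual `workflow_dispatch`), so a release would typically be cut as sketched below; the tag value is only an example.

```bash
# Cut a release by pushing a version tag that matches the 'v*.*.*' trigger.
# The tag value here is an example, not a required version.
git tag v0.2.0
git push origin v0.2.0
```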
11 changes: 9 additions & 2 deletions README.md
@@ -1,2 +1,9 @@
# hf-to-cb-dataset-migrator
Hugging face to Couchbase dataset migrator
# Hugging Face Dataset to Couchbase Migrator

A CLI tool to interact with Hugging Face datasets and migrate them to Couchbase, with support for streaming data.

## Installation

```bash
pip install -r requirements.txt
python setup.py install
```
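
A hedged sketch of a first run after installation: the `python -m` form below assumes the package from this commit is importable on the path and that its CLI group in `hf_to_cb_dataset_migrator/cli.py` is reachable that way; the console entry-point name configured by `setup.py` is not shown in this diff.

```bash
# Illustrative only: inspect the available subcommands and their options.
python -m hf_to_cb_dataset_migrator.cli --help
python -m hf_to_cb_dataset_migrator.cli list-configs --help
python -m hf_to_cb_dataset_migrator.cli migrate --help
```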
161 changes: 161 additions & 0 deletions hf_to_cb_dataset_migrator/cli.py
@@ -0,0 +1,161 @@
# hf_to_cb_dataset_migrator/cli.py

import click
import json
from migration import DatasetMigrator
from typing import Any

@click.group()
def main():
    """CLI tool to interact with Hugging Face datasets and migrate them to Couchbase."""
    pass

@main.command('list-configs')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--dynamic-modules-path', default=None, help='Path to dynamic modules (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--token', default=None, help='Use authentication token for private datasets.')
@click.option('--json-output', is_flag=True, help='Output the configurations in JSON format.')
def list_configs_cmd(path, revision, download_config, download_mode, dynamic_modules_path,
                     data_files, token, json_output):
    """List all configuration names for a given dataset."""
    migrator = DatasetMigrator(token=token)
    download_kwargs = {
        'revision': revision,
        # json.loads: --download-config is passed as a JSON string, not a file object.
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'dynamic_modules_path': dynamic_modules_path,
        'data_files': data_files if data_files else None,
    }
    # Remove None values
    download_kwargs = {k: v for k, v in download_kwargs.items() if v is not None}

    configs = migrator.list_configs(path, **download_kwargs)
    if configs:
        if json_output:
            click.echo(json.dumps(configs, indent=2))
        else:
            click.echo(f"Available configurations for '{path}':")
            for config in configs:
                click.echo(f"- {config}")
    else:
        click.echo(f"No configurations found for dataset '{path}' or dataset not found.")

@main.command('list-splits')
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', 'config_name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--json-output', is_flag=True, help='Output the splits in JSON format.')
def list_splits_cmd(path, config_name, data_files, download_config, download_mode, revision, token,
                    json_output):
    """List all available splits for a given dataset and configuration."""
    migrator = DatasetMigrator(token=token)

    config_kwargs = {
        'data_files': data_files if data_files else None,
        # json.loads: --download-config is a JSON string from the command line.
        'download_config': json.loads(download_config) if download_config else None,
        'download_mode': download_mode,
        'revision': revision,
    }
    # Remove None values
    config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}

    splits = migrator.list_splits(path, config_name=config_name, **config_kwargs)
    if splits:
        if json_output:
            click.echo(json.dumps(splits, indent=2))
        else:
            config_name_display = config_name if config_name else "default"
            click.echo(f"Available splits for dataset '{path}' with config '{config_name_display}':")
            for split in splits:
                click.echo(f"- {split}")
    else:
        click.echo(f"No splits found for dataset '{path}' with config '{config_name}' or dataset not found.")

@main.command()
@click.option('--path', required=True, help='Path or name of the dataset.')
@click.option('--name', default=None, help='Configuration name of the dataset (optional).')
@click.option('--data-dir', default=None, help='Directory with the data files (optional).')
@click.option('--data-files', default=None, multiple=True, help='Path(s) to source data file(s) (optional).')
@click.option('--split', default=None, help='Which split of the data to load (optional).')
@click.option('--cache-dir', default=None, help='Cache directory to store the datasets (optional).')
#@click.option('--features', default=None, help='Set of features to use (optional).')
@click.option('--download-config', default=None, help='Specific download configuration parameters (optional).')
@click.option('--download-mode', type=click.Choice(['reuse_dataset_if_exists', 'force_redownload']), default=None,
              help='Download mode (optional).')
@click.option('--verification-mode', type=click.Choice(['no_checks', 'basic_checks', 'all_checks']), default=None,
              help='Verification mode (optional).')
@click.option('--keep-in-memory', is_flag=True, default=False, help='Keep the dataset in memory (optional).')
@click.option('--save-infos', is_flag=True, default=False, help='Save dataset information (default: False).')
@click.option('--revision', default=None, help='Version of the dataset script to load (optional).')
@click.option('--token', default=None, help='Authentication token for private datasets (optional).')
@click.option('--streaming/--no-streaming', default=True, help='Load the dataset in streaming mode (default: True).')
@click.option('--num-proc', default=None, type=int, help='Number of processes to use (optional).')
@click.option('--storage-options', default=None, help='Storage options for remote filesystems (optional).')
@click.option('--trust-remote-code', is_flag=True, default=None,
              help='Allow loading arbitrary code from the dataset repository (optional).')
@click.option('--id-fields', default=None, help='Comma-separated list of field names to use as document ID.')
@click.option('--cb-url', prompt='Couchbase URL', help='Couchbase cluster URL (e.g., couchbase://localhost).')
@click.option('--cb-username', prompt='Couchbase username', help='Username for Couchbase authentication.')
@click.option('--cb-password', prompt=True, hide_input=True, confirmation_prompt=False,
              help='Password for Couchbase authentication.')
@click.option('--cb-bucket', prompt='Couchbase bucket name', help='Couchbase bucket to store data.')
@click.option('--cb-scope', default=None, help='Couchbase scope name (optional).')
@click.option('--cb-collection', default=None, help='Couchbase collection name (optional).')
def migrate(
    path, name, data_dir, data_files, split, cache_dir,
    #features,
    download_config, download_mode,
    verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options,
    trust_remote_code, id_fields, cb_url, cb_username, cb_password, cb_bucket, cb_scope, cb_collection
):
    """Migrate datasets from Hugging Face to Couchbase."""
    click.echo(f"Starting migration of dataset '{path}' to Couchbase bucket '{cb_bucket}'...")
    migrator = DatasetMigrator(token=token)

    # Prepare data_files
    data_files = list(data_files) if data_files else None

    result = migrator.migrate_dataset(
        path=path,
        cb_url=cb_url,
        cb_username=cb_username,
        cb_password=cb_password,
        couchbase_bucket=cb_bucket,
        cb_scope=cb_scope,
        cb_collection=cb_collection,
        id_fields=id_fields,
        name=name,
        data_dir=data_dir,
        data_files=data_files,
        split=split,
        cache_dir=cache_dir,
        #features=features,
        download_config=download_config,
        download_mode=download_mode,
        verification_mode=verification_mode,
        keep_in_memory=keep_in_memory,
        save_infos=save_infos,
        revision=revision,
        token=token,
        streaming=streaming,
        num_proc=num_proc,
        storage_options=json.loads(storage_options) if storage_options else None,
        trust_remote_code=trust_remote_code,
    )
    if result:
        click.echo("Migration completed successfully.")
    else:
        click.echo("Migration failed.")

if __name__ == '__main__':
    main()
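
To make the command surface above concrete, here is a hedged usage sketch: the dataset name, Couchbase URL, credentials, and bucket/scope/collection names are all placeholders, and the `python -m` invocation is an assumption since the installed entry-point name is not part of this diff. The password is omitted so Click prompts for it interactively.

```bash
# List configurations and splits for a dataset (dataset name is an example).
python -m hf_to_cb_dataset_migrator.cli list-configs --path imdb --json-output
python -m hf_to_cb_dataset_migrator.cli list-splits --path imdb --json-output

# Migrate a split into Couchbase; connection details below are placeholders.
# --cb-password is omitted so the CLI prompts for it (hide_input=True).
python -m hf_to_cb_dataset_migrator.cli migrate \
  --path imdb \
  --split train \
  --streaming \
  --cb-url couchbase://localhost \
  --cb-username Administrator \
  --cb-bucket hf_datasets \
  --cb-scope my_scope \
  --cb-collection my_collection
```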