iree-org · mariecwhite · Oct 5, 2023 · Oct 7, 2023
diff --git a/.github/workflows/run_tflite_benchmark.yml b/.github/workflows/run_tflite_benchmark.yml
@@ -0,0 +1,123 @@
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# TFLite Benchmarks Workflow.
+
+name: TFLite Benchmarks
+
+on:
+  workflow_dispatch:
+  pull_request:
+
+concurrency:
+  # The group key is the PR number for pull_request events and the commit hash
+  # for every other trigger, so a newer run cancels queued and in-progress
+  # runs for the same PR (presubmit) or commit (postsubmit).
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+env:  # GCS_DIR is unique per run id and attempt; non-PR triggers use the postsubmit bucket.
+  GCS_DIR: gs://openxla-github-actions-${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}-artifacts/${{ github.run_id }}/${{ github.run_attempt }}
+
+jobs:
+  setup:
+    runs-on: ubuntu-22.04
+    outputs:
+      runner-group: ${{ steps.configure.outputs.runner-group }}
+      benchmark-gcs-dir: ${{ steps.configure.outputs.benchmark-gcs-dir }}
+    steps:
+      - name: "Checking out PR repository"
+        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791  # v2.5.0
+      - name: "Configuring CI options"
+        id: configure
+        env:
+          RUNNER_GROUP: ${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
+        run: |
+          # Just informative logging. There should only be two commits in the
+          # history here, but limiting the depth helps when copying from a local
+          # repo instead of using checkout, e.g. with
+          # https://github.com/nektos/act where there will be more.
+          git log --oneline --graph --max-count=3
+          # Workflow jobs can't access `env` in `runs-on`, so `runner-group` is
+          # exported as a job output. Append (>>) so no output is clobbered.
+          echo "runner-group=${RUNNER_GROUP}" >> "${GITHUB_OUTPUT}"
+
+          # For presubmit testing, the result artifacts are uploaded to the
+          # temporary workflow GCS dir. In postsubmit, they go to a directory
+          # named <date>.<epoch seconds> in the comparative benchmark bucket.
+          if [[ "${RUNNER_GROUP}" == "presubmit" ]]; then
+            BENCHMARK_GCS_DIR="${GCS_DIR}/comparative-benchmark-artifacts"
+          else
+            BENCHMARK_GCS_DIR="gs://comparative-benchmark-artifacts/$(date +'%Y-%m-%d.%s')"
+          fi
+          echo "benchmark-gcs-dir=${BENCHMARK_GCS_DIR}" >> "${GITHUB_OUTPUT}"
+
+  benchmark_on_c2-standard-16:
+    needs: [setup]
+    runs-on:
+      - self-hosted  # must come first
+      - runner-group=${{ needs.setup.outputs.runner-group }}
+      - environment=prod
+      - machine-type=c2-standard-16
+    env:  # shared by every step of this job
+      BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }}
+      RESULTS_DIR: results-dir
+      TARGET_DEVICE: c2-standard-16
+      TFLITE_TOOL_DIR: tool-dir
+    steps:
+      - name: "Checking out PR repository"
+        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791  # v2.5.0
+      - name: "Setup"  # exports results-gcs-dir and creates the local scratch dirs
+        id: setup
+        run: |
+          echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}"
+          mkdir "${RESULTS_DIR}"
+          mkdir "${TFLITE_TOOL_DIR}"
+      - name: "Benchmarking TFLite"  # runs inside the pinned image, then uploads the JSON
+        env:
+          TFLITE_RESULTS_JSON: tflite.json
+          RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }}
+        run: |
+          RESULTS_PATH="${RESULTS_DIR}/${TFLITE_RESULTS_JSON}"
+          docker run --mount="type=bind,src=${PWD},target=/work" --workdir="/work" \
+            "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \
+            ./experimental/tflite/benchmark_tflite.sh \
+              "${TARGET_DEVICE}" \
+              "${TFLITE_TOOL_DIR}" \
+              "${RESULTS_PATH}"
+          gcloud storage cp "${RESULTS_PATH}" "${RESULTS_GCS_DIR}/"
+
+  benchmark_on_pixel-6-pro:
+    needs: [setup]
+    runs-on:
+      - self-hosted  # must come first
+      - runner-group=${{ needs.setup.outputs.runner-group }}
+      - environment=prod
+      - machine-type=pixel-6-pro
+    env:  # shared by every step of this job
+      BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }}
+      RESULTS_DIR: results-dir
+      TARGET_DEVICE: pixel-6-pro
+      TFLITE_TOOL_DIR: tool-dir
+    steps:
+      - name: "Checking out PR repository"
+        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791  # v2.5.0
+      - name: "Setup"  # exports results-gcs-dir and creates the local scratch dirs
+        id: setup
+        run: |
+          echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}"
+          mkdir "${RESULTS_DIR}"
+          mkdir "${TFLITE_TOOL_DIR}"
+      - name: "Benchmarking TFLite"  # NOTE(review): currently runs the IREE script, not TFLite
+        env:  # only referenced by the commented-out TFLite commands in `run`
+          TFLITE_RESULTS_JSON: tflite.json
+          RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }}
+        run: |
+          ./experimental/iree/benchmark_iree.sh
+
+          #RESULTS_PATH="${RESULTS_DIR}/${TFLITE_RESULTS_JSON}"
+          #./experimental/tflite/benchmark_tflite.sh "${TARGET_DEVICE}" "${TFLITE_TOOL_DIR}" "${RESULTS_PATH}"
+          #gcloud storage cp "${RESULTS_PATH}" "${RESULTS_GCS_DIR}/"
diff --git a/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py b/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py
@@ -10,7 +10,7 @@
 from openxla.benchmark import def_types
 from openxla.benchmark.comparative_suite import utils
 
-PARENT_GCS_DIR = "https://storage.googleapis.com/iree-model-artifacts/tensorflow/tf_models_2.15.0.dev20230817_1692333975/"
+PARENT_GCS_DIR = "https://storage.googleapis.com/iree-model-artifacts/tensorflow/tf_models_2.15.0.dev20230829_1696537918/"
 ARTIFACTS_DIR_URL_TEMPLATE = string.Template(PARENT_GCS_DIR + "${name}")
 
 # T5-Large models.
@@ -37,6 +37,7 @@
         def_types.ModelArtifactType.STABLEHLO_MLIR,
         def_types.ModelArtifactType.XLA_HLO_DUMP,
         def_types.ModelArtifactType.TF_SAVEDMODEL_V2,
+        def_types.ModelArtifactType.TFLITE_FP32,
     ],
 )
 T5_LARGE_FP32_TF_512XI32_BATCHES = utils.build_batch_models(
@@ -69,6 +70,7 @@
         def_types.ModelArtifactType.STABLEHLO_MLIR,
         def_types.ModelArtifactType.XLA_HLO_DUMP,
         def_types.ModelArtifactType.TF_SAVEDMODEL_V2,
+        def_types.ModelArtifactType.TFLITE_FP32,
     ],
 )
 BERT_LARGE_FP32_TF_384XI32_BATCHES = utils.build_batch_models(
@@ -100,6 +102,8 @@
         def_types.ModelArtifactType.STABLEHLO_MLIR,
         def_types.ModelArtifactType.XLA_HLO_DUMP,
         def_types.ModelArtifactType.TF_SAVEDMODEL_V2,
+        def_types.ModelArtifactType.TFLITE_FP32,
+        def_types.ModelArtifactType.TFLITE_INT8,
     ],
 )
 RESNET50_FP32_TF_224X224X3XF32_BATCHES = utils.build_batch_models(
@@ -130,6 +134,7 @@
         def_types.ModelArtifactType.STABLEHLO_MLIR,
         def_types.ModelArtifactType.XLA_HLO_DUMP,
         def_types.ModelArtifactType.TF_SAVEDMODEL_V2,
+        def_types.ModelArtifactType.TFLITE_FP32,
     ],
 )
 

diff --git a/...enchmark_suite/openxla/benchmark/comparative_suite/tf/scripts/generate_model_artifacts.py b/...enchmark_suite/openxla/benchmark/comparative_suite/tf/scripts/generate_model_artifacts.py
@@ -9,6 +9,7 @@
 import pathlib
 import re
 import multiprocessing
+import numpy as np
 import shutil
 import sys
 import tarfile
@@ -61,6 +62,47 @@ def _generate_mlir(model_dir: pathlib.Path, saved_model_dir: pathlib.Path):
   write_bytecode(str(mlir_path), result)
 
 
+def _generate_tflite(inputs: Tuple[Any, ...], model_dir: pathlib.Path,
+                     saved_model_dir: pathlib.Path):
+  converter = tf.lite.TFLiteConverter.from_saved_model(str(saved_model_dir))
+
+  # Generate fp32 model.
+  try:
+    tflite_model = converter.convert()
+    tflite_model_path = model_dir.joinpath("model_fp32.tflite")
+    with open(tflite_model_path, 'wb') as f:
+      f.write(tflite_model)
+  except Exception as e:
+    print(f"Failed to generate int8 TFLite model. Exception: {e}")
+
+  # Generate int8 model.
+  try:
+
+    def representative_examples():
+      for _ in range(2):
+        random_inputs = []
+        for input in inputs:
+          random_inputs.append(
+              np.random.uniform(low=input.dtype.min,
+                                high=input.dtype.max,
+                                size=input.shape).astype(
+                                    input.dtype.as_numpy_dtype))
+        yield random_inputs
+
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.target_spec.supported_ops = [
+        tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.TFLITE_BUILTINS_INT8
+    ]
+    converter.representative_dataset = representative_examples
+    converter.inference_type = tf.int8
+    tflite_model_int8 = converter.convert()
+    tflite_model_int8_path = model_dir.joinpath("model_int8.tflite")
+    with open(tflite_model_int8_path, 'wb') as f:
+      f.write(tflite_model_int8)
+  except Exception as e:
+    print(f"Failed to generate int8 TFLite model. Exception: {e}")
+
+
 def _generate_artifacts(model: def_types.Model, save_dir: pathlib.Path,
                         auto_upload: bool):
   model_dir = save_dir.joinpath(model.name)
@@ -87,6 +129,7 @@ def _generate_artifacts(model: def_types.Model, save_dir: pathlib.Path,
 
     saved_model_dir = _generate_saved_model(inputs, model_obj, model_dir)
     _generate_mlir(model_dir, saved_model_dir)
+    _generate_tflite(inputs, model_dir, saved_model_dir)
 
     with tarfile.open(model_dir.joinpath("tf-model.tgz"), "w:gz") as tar:
       tar.add(f"{saved_model_dir}/", arcname="")

diff --git a/common_benchmark_suite/openxla/benchmark/def_types.py b/common_benchmark_suite/openxla/benchmark/def_types.py
@@ -15,6 +15,7 @@ class ModelFrameworkType(Enum):
   """Type of framework a model is implemented in."""
   TF_V1 = "tensorflow_v1"
   TF_V2 = "tensorflow_v2"
+  TFLITE = "tflite"
   PYTORCH = "pytorch"
   JAX = "jax"
   GGML = "ggml"
@@ -42,6 +43,8 @@ class ModelArtifactType(Enum):
   """Type of derived model artifact."""
   TF_SAVEDMODEL_V1 = "tf_savedmodel_v1"
   TF_SAVEDMODEL_V2 = "tf_savedmodel_v2"
+  TFLITE_FP32 = "tflite_fp32"
+  TFLITE_INT8 = "tflite_int8"
   XLA_HLO_DUMP = "xla_hlo_dump"
   STABLEHLO_MLIR = "stablehlo_mlir"
   LINALG_MLIR = "linalg_mlir"

diff --git a/experimental/ggml/benchmark_ggml.sh b/experimental/ggml/benchmark_ggml.sh
@@ -88,8 +88,8 @@ if [[ "${TARGET_DEVICE_NAME}" =~ ^(pixel-4|pixel-6-pro|moto-edge-x30)$ ]]; then
   adb shell "su root sh /data/local/tmp/set_android_scaling_governor.sh performance"
 else
   BENCHMARK_SCRIPT="run_benchmarks.py"
-  # c2-standard-16 has 16 cores.
-  THREADS="1,8,16"
+  # c2-standard-16 has 8 physical cores (16 vCPUs).
+  THREADS="1,8"
 
   args+=(
      --threads "${THREADS}"

diff --git a/experimental/iree/benchmark_iree.sh b/experimental/iree/benchmark_iree.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# This is a temporary hack to run IREE benchmarks on pixel-6-pro since
+# it's currently not working in the IREE repo.
+
+ROOT_DIR=/tmp/iree-benchmarks
+TD="$(cd $(dirname $0) && pwd)"
+
+rm -rf "${ROOT_DIR}"
+mkdir "${ROOT_DIR}"
+pushd "${ROOT_DIR}"
+
+# Download benchmark tool.
+gsutil cp "gs://iree-github-actions-presubmit-artifacts/6464567954/1/benchmark-tools/android-armv8.2-a-benchmark-tools.tar" .
+tar -xf "android-armv8.2-a-benchmark-tools.tar"
+adb push "android-armv8.2-a-benchmark-tools-dir/build/tools/iree-benchmark-module" "/data/local/tmp"
+adb shell "chmod +x /data/local/tmp/iree-benchmark-module"
+
+# Download vmfb's.
+
+
+# Setup environment.
+adb push "${TD}/set_android_scaling_governor.sh" "/data/local/tmp"
+adb shell "chmod +x /data/local/tmp/set_android_scaling_governor.sh"
+adb shell "su root sh /data/local/tmp/set_android_scaling_governor.sh performance"
+
+# Benchmark.
+ITERATIONS=10
+gsutil cp "gs://iree-github-actions-presubmit-artifacts/6464567954/1/e2e-test-artifacts/iree_module_BertLarge_Fp32_Batch1_tflite___armv8.2-a-generic-linux_android29-llvm_cpu__experimental-flags_data-tiling_ukernel_/module.vmfb" "BertLarge_Batch1.vmfb"
+adb push "BertLarge_Batch1.vmfb" "/data/local/tmp"
+adb shell "taskset f0 /data/local/tmp/iree-benchmark-module --function=main --input=1x384xi32=0 --input=1x384xi32=0 --device_allocator=caching --task_topology_group_count=4 --device=local-task --module=/data/local/tmp/BertLarge_Batch1.vmfb --time_unit=ns --benchmark_format=json --benchmark_out_format=json --print_statistics=true --benchmark_repetitions=${ITERATIONS}"
+adb shell "rm /data/local/tmp/BertLarge_Batch1.vmfb"
+rm "BertLarge_Batch1.vmfb"
+
+gsutil cp "gs://iree-github-actions-presubmit-artifacts/6464567954/1/e2e-test-artifacts/iree_module_BertLarge_Fp32_Batch16_tflite___armv8.2-a-generic-linux_android29-llvm_cpu__experimental-flags_data-tiling_ukernel_/module.vmfb" "BertLarge_Batch16.vmfb"
+adb push "BertLarge_Batch16.vmfb" "/data/local/tmp"
+adb shell "taskset f0 /data/local/tmp/iree-benchmark-module --function=main --input=16x384xi32=0 --input=16x384xi32=0 --device_allocator=caching --task_topology_group_count=4 --device=local-task --module=/data/local/tmp/BertLarge_Batch16.vmfb --time_unit=ns --benchmark_format=json --benchmark_out_format=json --print_statistics=true --benchmark_repetitions=${ITERATIONS}"
+adb shell "rm /data/local/tmp/BertLarge_Batch16.vmfb"
+rm "BertLarge_Batch16.vmfb"
+
+gsutil cp "gs://iree-github-actions-presubmit-artifacts/6464567954/1/e2e-test-artifacts/iree_module_BertLarge_Fp32_Batch24_tflite___armv8.2-a-generic-linux_android29-llvm_cpu__experimental-flags_data-tiling_ukernel_/module.vmfb" "BertLarge_Batch24.vmfb"
+adb push "BertLarge_Batch24.vmfb" "/data/local/tmp"
+adb shell "taskset f0 /data/local/tmp/iree-benchmark-module --function=main --input=24x384xi32=0 --input=24x384xi32=0 --device_allocator=caching --task_topology_group_count=4 --device=local-task --module=/data/local/tmp/BertLarge_Batch24.vmfb --time_unit=ns --benchmark_format=json --benchmark_out_format=json --print_statistics=true --benchmark_repetitions=${ITERATIONS}"
+adb shell "rm /data/local/tmp/BertLarge_Batch24.vmfb"
+rm "BertLarge_Batch24.vmfb"
+
+gsutil cp "gs://iree-github-actions-presubmit-artifacts/6464567954/1/e2e-test-artifacts/iree_module_BertLarge_Fp32_Batch32_tflite___armv8.2-a-generic-linux_android29-llvm_cpu__experimental-flags_data-tiling_ukernel_/module.vmfb" "BertLarge_Batch32.vmfb"
+adb push "BertLarge_Batch32.vmfb" "/data/local/tmp"
+adb shell "taskset f0 /data/local/tmp/iree-benchmark-module --function=main --input=32x384xi32=0 --input=32x384xi32=0 --device_allocator=caching --task_topology_group_count=4 --device=local-task --module=/data/local/tmp/BertLarge_Batch32.vmfb --time_unit=ns --benchmark_format=json --benchmark_out_format=json --print_statistics=true --benchmark_repetitions=${ITERATIONS}"
+adb shell "rm /data/local/tmp/BertLarge_Batch32.vmfb"
+rm "BertLarge_Batch32.vmfb"
+
+adb shell "rm -rf /data/local/tmp/*"
+
+popd
+rm -rf "${ROOT_DIR}"
+
+
+
+
+
diff --git a/experimental/iree/set_android_scaling_governor.sh b/experimental/iree/set_android_scaling_governor.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Runs on an android device itself to set the frequency scaling governor for all
+# CPUs (default performance).
+
+################################### WARNING ####################################
+# This will overheat the phone if it's not on a cooling plate, resulting in    #
+# thermal throttling. To prevent anything catching on fire, the actual CPU     #
+# frequencies will be throttled to below the maximum, skewing your results.    #
+################################################################################
+
+set -euo pipefail
+
+GOVERNOR="${1:-performance}"
+
+# Prints CPU indices, one per line, by expanding the range stored in
+# /sys/devices/system/cpu/present (e.g. "0-7") with seq.
+list_cpus() {
+  cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq
+}
+
+# Prints a heading followed by one table row per CPU showing its governor and
+# its current, minimum and maximum frequency. $1: heading line.
+print_cpu_info() {
+  echo "$1"
+  echo 'cpu\tgovernor\tcur\tmin\tmax'
+  echo "------------------------------------------------"
+  for cpu in $(list_cpus); do
+    cpufreq_dir="/sys/devices/system/cpu/cpu${cpu}/cpufreq"
+    echo "cpu${cpu}" | paste \
+      - \
+      "${cpufreq_dir}/scaling_governor" \
+      "${cpufreq_dir}/cpuinfo_cur_freq" \
+      "${cpufreq_dir}/cpuinfo_min_freq" \
+      "${cpufreq_dir}/cpuinfo_max_freq"
+  done
+}
+
+print_cpu_info "CPU info (before changing governor):"
+
+echo "Setting CPU frequency governor to ${GOVERNOR}"
+
+for cpu in $(list_cpus); do
+  echo "${GOVERNOR}" > "/sys/devices/system/cpu/cpu${cpu}/cpufreq/scaling_governor"
+done
+
+print_cpu_info "CPU info (after changing governor):"