diff --git a/tuner/examples/punet/.gitignore b/tuner/examples/punet/.gitignore
new file mode 100644
index 000000000..fae904ffb
--- /dev/null
+++ b/tuner/examples/punet/.gitignore
@@ -0,0 +1,3 @@
+# Test files/dirs recommended by README.md.
+dump-mmt
+test-benchmark.mlir
diff --git a/tuner/examples/punet/2.mlir b/tuner/examples/punet/2.mlir
deleted file mode 100644
index cddf89428..000000000
--- a/tuner/examples/punet/2.mlir
+++ /dev/null
@@ -1,80 +0,0 @@
-module attributes {hal.device.targets = [#hal.device.target<"rocm", {legacy_sync}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>, <MFMA_F8E4M3FNUZ_16x16x32_F32>, <MFMA_I8_16x16x32_I32>, <MFMA_I8_32x32x16_I32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>, ukernels = "none"}>]>]} {
-  hal.executable private @main_2_dispatch_0 {
-    hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>, <MFMA_F8E4M3FNUZ_16x16x32_F32>, <MFMA_I8_16x16x32_I32>, <MFMA_I8_32x32x16_I32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>, ukernels = "none"}>) {
-      hal.executable.export public @main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
-      ^bb0(%arg0: !hal.device):
-        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
-        hal.return %x, %y, %z : index, index, index
-      }
-      builtin.module {
-        func.func @main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2>, prefetch_shared_memory}>} {
-          %cst = arith.constant 0.000000e+00 : f16
-          %c0 = arith.constant 0 : index
-          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x5120xf16>>
-          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x5120xf16>>
-          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1280xf32>>
-          %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 5120], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x5120xf16>> -> tensor<2048x5120xf16>
-          %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 5120], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x5120xf16>> -> tensor<1280x5120xf16>
-          %5 = tensor.empty() : tensor<2048x1280xf32>
-          %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} ins(%cst : f16) outs(%5 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>
-          %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<2048x5120xf16>, tensor<1280x5120xf16>) outs(%6 : tensor<2048x1280xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128, 64]]>} {
-          ^bb0(%in: f16, %in_0: f16, %out: f32):
-            %8 = arith.extf %in : f16 to f32
-            %9 = arith.extf %in_0 : f16 to f32
-            %10 = arith.mulf %8, %9 : f32
-            %11 = arith.addf %out, %10 : f32
-            linalg.yield %11 : f32
-          } -> tensor<2048x1280xf32>
-          flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : tensor<2048x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1280xf32>>
-          return
-        }
-      }
-    }
-  }
-  util.global private mutable @main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer
-  util.initializer {
-    %c44564480 = arith.constant 44564480 : index
-    %c-1_i64 = arith.constant -1 : i64
-    %c0 = arith.constant 0 : index
-    %device_0 = hal.devices.get %c0 : !hal.device
-    %allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
-    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c44564480}
-    util.global.store %buffer, @main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer
-    util.return
-  }
-  util.func public @main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
-    %c-1_i32 = arith.constant -1 : i32
-    %c-1_i64 = arith.constant -1 : i64
-    %c10485760 = arith.constant 10485760 : index
-    %c34078720 = arith.constant 34078720 : index
-    %c2 = arith.constant 2 : index
-    %c13107200 = arith.constant 13107200 : index
-    %c1 = arith.constant 1 : index
-    %c20971520 = arith.constant 20971520 : index
-    %c0 = arith.constant 0 : index
-    %0 = arith.index_cast %arg0 : i32 to index
-    %device_0 = hal.devices.get %c0 : !hal.device
-    %cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
-    %pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) : !hal.pipeline_layout
-    %main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer = util.global.load @main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer
-    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
-      %c0 = (%main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer)[%c0, %c20971520],
-      %c1 = (%main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer)[%c20971520, %c13107200],
-      %c2 = (%main_2_dispatch_0_rocm_hsaco_fb_main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32_buffer : !hal.buffer)[%c34078720, %c10485760]
-    ])
-    %workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main_2_dispatch_0::@rocm_hsaco_fb::@main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32) : index, index, index
-    %exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main_2_dispatch_0) : !hal.executable
-    %ordinal = hal.executable.export.ordinal target(@main_2_dispatch_0::@rocm_hsaco_fb::@main_2_dispatch_0_matmul_like_2048x1280x5120_f16xf16xf32) : index
-    scf.for %arg1 = %c0 to %0 step %c1 {
-      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
-      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
-    }
-    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
-    %1 = util.null : !hal.fence
-    %fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
-    hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
-    %status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
-    util.status.check_ok %status, "failed to wait on timepoint"
-    util.return
-  }
-}
diff --git a/tuner/examples/punet/README.md b/tuner/examples/punet/README.md
index 85ea3539e..ed5490121 100644
--- a/tuner/examples/punet/README.md
+++ b/tuner/examples/punet/README.md
@@ -5,39 +5,42 @@ Follow instructions in [`/tuner/README.md`](../README.md)
 
 ## Shell Scripts
 
-The required shell scripts can be downloaded from: [sdxl-scripts](https://github.com/nod-ai/sdxl-scripts)
+The required shell scripts can be downloaded from:
+[sdxl-scripts](https://github.com/nod-ai/sdxl-scripts).
 
 These scripts include:
 1. `compile-punet-base.sh` - Used for compiling model candidates.
 2. `compile_candidate.sh` - Used for compiling dispatch candidates.
 3. `punet.sh` - Invoked by `compile_candidate.sh`.
 
-Please configure the file paths and update commands in `PunetClient`.
-**Note:** Alternatively, add these scripts to your `PATH` environment variable
+Add the parent directories of these scripts to your `PATH` environment variable,
+so that they can be picked up by `punet_autotune.py`.
 
 ## Running the Tuner
 
 ### [Optional] Generate a tunable mlir
-A sample `2.mlir` is provided for test run. Hoever, this file may become outdated if IREE makes changes to the MLIR format. To ensure you are working with the latest format, please follow the instructions below to compile and generate the most recent benchmark file.
-Use [`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh) to compile the sample matmul `mmt.mlir` (can also find here: [`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
-```
-./punet.sh ./mmt.mlir -o baseline.vmfb --iree-hal-dump-executable-files-to=dump-mmt
-cp ./dump-mmt/module_main_2_dispatch_0_rocm_hsaco_fb_benchmark.mlir ./2.mlir
+Use
+[`punet.sh`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/punet.sh)
+to compile the sample matmul `mmt.mlir` (also available here:
+[`mmt_unet.mlir`](https://github.com/nod-ai/sdxl-scripts/blob/main/tuning/mmt_unet.mlir)):
+```shell
+punet.sh mmt.mlir -o mmt.vmfb --iree-hal-dump-executable-files-to=dump-mmt
+cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchmark.mlir
 ```
 
 ### Recommended Trial Run
 For an initial trial to test the tuning loop, use:
-```
-python punet_autotune.py 2.mlir --num-candidates=1
+```shell
+python punet_autotune.py test-benchmark.mlir --num-candidates=10
 ```
 
 ### Dry Run Test
 To perform a dry run (no GPU required), use:
-```
-python punet_autotune.py 2.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
+```shell
+python punet_autotune.py test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
 ```
 
 ### Basic Usage
-```
-python punet_autotune.py 2.mlir
+```shell
+python punet_autotune.py test-benchmark.mlir
 ```
diff --git a/tuner/examples/punet/mmt.mlir b/tuner/examples/punet/mmt.mlir
index 6673d830b..b9d6c5f4c 100644
--- a/tuner/examples/punet/mmt.mlir
+++ b/tuner/examples/punet/mmt.mlir
@@ -1,20 +1,3 @@
-// RUN: iree-compile --iree-hal-target-backends=rocm --iree-rocm-target-chip=gfx942 \
-// RUN:   --iree-rocm-link-bc=true --iree-rocm-bc-dir=/opt/rocm/amdgcn/bitcode \
-// RUN:   --iree-global-opt-propagate-transposes=true --iree-opt-outer-dim-concat=true \
-// RUN:   --iree-opt-const-eval=false --iree-codegen-gpu-native-math-precision=true --iree-rocm-waves-per-eu=2 \
-// RUN:   --iree-preprocessing-pass-pipeline='builtin.module(iree-preprocessing-transpose-convolution-pipeline)' \
-// RUN:   --iree-codegen-llvmgpu-use-vector-distribution --iree-codegen-transform-dialect-library=config.mlir \
-// RUN:   %s -o %s.vmfb
-
-// To compile to for benchmarking, add:
-//  --iree-flow-export-benchmark-funcs --iree-hal-benchmark-dispatch-repeat-count=1000
-//
-// To benchmark:
-//   for i in {0..4} ; do
-//     iree-benchmark-module --device=rocm://7 --module=%s.vmfb --function="main_${i}_benchmark" --device_allocator=caching \
-//       --batch_size=1000 --benchmark_repetitions=5
-//   done
-
 !matA_0 = tensor<2048x1280xf16>
 !matB_0 = tensor<10240x1280xf16>
 !matC_0 = tensor<2048x10240xf32>
@@ -26,51 +9,3 @@ func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 {
   %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
   return %8 : !matC_0
 }
-
-!matA_1 = tensor<2048x1280xf16>
-!matB_1 = tensor<1280x1280xf16>
-!matC_1 = tensor<2048x1280xf32>
-
-func.func @main_1(%arg0: !matA_1, %arg1: !matB_1) -> !matC_1 {
-  %cst = arith.constant 0.000000e+00 : f16
-  %5 = tensor.empty() : !matC_1
-  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_1) -> !matC_1
-  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_1, !matB_1) outs(%6 : !matC_1) -> !matC_1
-  return %8 : !matC_1
-}
-
-!matA_2 = tensor<2048x5120xf16>
-!matB_2 = tensor<1280x5120xf16>
-!matC_2 = tensor<2048x1280xf32>
-
-func.func @main_2(%arg0: !matA_2, %arg1: !matB_2) -> !matC_2 {
-  %cst = arith.constant 0.000000e+00 : f16
-  %5 = tensor.empty() : !matC_2
-  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_2) -> !matC_2
-  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_2, !matB_2) outs(%6 : !matC_2) -> !matC_2
-  return %8 : !matC_2
-}
-
-!matA_3 = tensor<128x2048xf16>
-!matB_3 = tensor<1280x2048xf16>
-!matC_3 = tensor<128x1280xf32>
-
-func.func @main_3(%arg0: !matA_3, %arg1: !matB_3) -> !matC_3 {
-  %cst = arith.constant 0.000000e+00 : f16
-  %5 = tensor.empty() : !matC_3
-  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_3) -> !matC_3
-  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_3, !matB_3) outs(%6 : !matC_3) -> !matC_3
-  return %8 : !matC_3
-}
-
-!matA_4 = tensor<8192x640xf16>
-!matB_4 = tensor<5120x640xf16>
-!matC_4 = tensor<8192x5120xf32>
-
-func.func @main_4(%arg0: !matA_4, %arg1: !matB_4) -> !matC_4 {
-  %cst = arith.constant 0.000000e+00 : f16
-  %5 = tensor.empty() : !matC_4
-  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_4) -> !matC_4
-  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_4, !matB_4) outs(%6 : !matC_4) -> !matC_4
-  return %8 : !matC_4
-}
diff --git a/tuner/libtuner.py b/tuner/libtuner.py
old mode 100755
new mode 100644
diff --git a/tuner/libtuner_test.py b/tuner/libtuner_test.py
index 66342faa7..187ceb251 100644
--- a/tuner/libtuner_test.py
+++ b/tuner/libtuner_test.py
@@ -120,20 +120,11 @@ def test_parse_dispatch_benchmark_results():
     mock_result_3.candidate_id = 3
     benchmark_results = [mock_result_1, mock_result_2, mock_result_3]
 
-    candidate_tracker_0 = libtuner.CandidateTracker(candidate_id=0)
-    candidate_tracker_0.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/0.mlir")
-    candidate_tracker_1 = libtuner.CandidateTracker(candidate_id=1)
-    candidate_tracker_1.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/1.mlir")
-    candidate_tracker_2 = libtuner.CandidateTracker(candidate_id=2)
-    candidate_tracker_2.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/2.mlir")
-    candidate_tracker_3 = libtuner.CandidateTracker(candidate_id=3)
-    candidate_tracker_3.dispatch_mlir_path = libtuner.Path("/mock/mlir/path/3.mlir")
-    candidate_trackers = [
-        candidate_tracker_0,
-        candidate_tracker_1,
-        candidate_tracker_2,
-        candidate_tracker_3,
-    ]
+    candidate_trackers = []
+    for i in range(4):
+        tracker = libtuner.CandidateTracker(candidate_id=i)
+        tracker.dispatch_mlir_path = libtuner.Path(f"/mock/mlir/path/{i}.mlir")
+        candidate_trackers.append(tracker)
 
     expected_parsed_results = [
         libtuner.ParsedDisptachBenchmarkResult(