From 7adbb4a3ed4ce05d366222e640fed8ce32598f2a Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Tue, 12 Dec 2023 01:08:51 +0100
Subject: [PATCH 01/13] Put tensile source into src dir and remove tests from
 source

---
 Tensile/Configs/build_client.yaml             |  28 ------------------
 .../bugs/2sum_src_pgr1_smallsum.yaml          |   0
 {Tensile/Tests => Tests}/bugs/d2lds.yaml      |   0
 .../bugs/fractional_plus_pbc.yaml             |   0
 .../Tests => Tests}/bugs/free10_swap.yaml     |   0
 {Tensile/Tests => Tests}/bugs/hpa_beta.yaml   |   0
 .../Tests => Tests}/bugs/nosourcetmp.yaml     |   0
 .../bugs/simple_use_initial_strides_1.yaml    |   0
 {Tensile/Tests => Tests}/bugs/swizzlec1.yaml  |   0
 .../bugs/test_glvw4_edge_no_asem.yaml         |   0
 ...lts[Run_Contraction-src1].contraction.yaml |   0
 {Tensile/Tests => Tests}/conftest.py          |   0
 {Tensile/Tests => Tests}/create_tests.py      |   0
 .../disabled/classic/test_convolution.yaml    |   0
 .../convolution/test_conv_act1d_filter1d.yaml |   0
 .../test_conv_act1d_filter1d_simple.yaml      |   0
 .../test_conv_act1d_filter2d_simple.yaml      |   0
 .../test_conv_act1d_filter3d_simple.yaml      |   0
 .../test_conv_act1d_filter5d_simple.yaml      |   0
 .../convolution/test_conv_act2d_filter1d.yaml |   0
 .../test_conv_act2d_filter1d_simple.yaml      |   0
 .../disabled/direct_to_lds/dtl_dgemm.yaml     |   0
 .../direct_to_lds/dtl_dgemm_lite.yaml         |   0
 .../direct_to_lds/dtl_tsgr_dgemm.yaml         |   0
 .../disabled/hgemm_nn_source.yaml             |   0
 .../disabled/multi_sum/test_.py               |   0
 .../disabled/starter_packed_case.yaml         |   0
 .../disabled/stridea0_pack_nt.yaml            |   0
 .../disabled/strideb0_pack_nn.yaml            |   0
 .../disabled/test_assertion_selection.yaml    |   0
 .../disabled/test_create_library.yaml         |   0
 {Tensile/Tests => Tests}/dot/mixmad-nt.yaml   |   0
 {Tensile/Tests => Tests}/dot/mixmad.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_nn.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_nt.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_tn.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_tt.yaml      |   0
 .../Tests => Tests}/emulation/dgemm_asm.yaml  |   0
 .../double_complex/double_complex_hip_cn.yaml |   0
 .../b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml    |   0
 .../b8f8gemm_hybrid_b8f8b8s_gfx940.yaml       |   0
 .../float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml |   0
 .../float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml |   0
 .../float8/b8gemm_b8b8s_SR_gfx940.yaml        |   0
 .../emulation/float8/b8gemm_b8b8s_gfx940.yaml |   0
 .../emulation/float8/b8gemm_b8hs_gfx940.yaml  |   0
 .../emulation/float8/b8gemm_b8ss_gfx940.yaml  |   0
 .../f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml    |   0
 .../f8b8gemm_hybrid_f8b8b8s_gfx940.yaml       |   0
 .../float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml |   0
 .../float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml |   0
 ...f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml |   0
 .../float8/f8gemm_f8f8s_SR_gfx940.yaml        |   0
 .../emulation/float8/f8gemm_f8f8s_gfx940.yaml |   0
 .../emulation/float8/f8gemm_f8hs_gfx940.yaml  |   0
 .../emulation/float8/f8gemm_f8ss_gfx940.yaml  |   0
 .../float_complex/float_complex_hip_cc.yaml   |   0
 .../emulation/hgemm_asm_nn.yaml               |   0
 .../emulation/hgemm_asm_nt.yaml               |   0
 .../emulation/hgemm_asm_tn.yaml               |   0
 .../emulation/hgemm_asm_tt.yaml               |   0
 .../emulation/hgemm_hpa_asm_nn.yaml           |   0
 .../emulation/hgemm_hpa_asm_nt.yaml           |   0
 .../emulation/hgemm_hpa_asm_tn.yaml           |   0
 .../emulation/hgemm_hpa_asm_tt.yaml           |   0
 .../emulation/igemm_hpa_hip_nn.yaml           |   0
 .../emulation/igemm_hpa_hip_nt.yaml           |   0
 .../emulation/igemm_hpa_hip_tn.yaml           |   0
 .../emulation/igemm_hpa_hip_tt.yaml           |   0
 .../Tests => Tests}/emulation/mfma/1LDSB.yaml |   0
 .../emulation/mfma/cgemm_asm.yaml             |   0
 .../emulation/mfma/cgemm_asm_conjugate.yaml   |   0
 .../Tests => Tests}/emulation/mfma/dgemm.yaml |   0
 .../emulation/mfma/hpa_bfloat16_gemm_asm.yaml |   0
 .../mfma/hpa_bfloat16_gemm_asm_gfx940.yaml    |   0
 .../emulation/mfma/hpa_hgemm_asm.yaml         |   0
 .../mfma/hpa_igemm_i8_asm_gfx940.yaml         |   0
 .../Tests => Tests}/emulation/mfma/sgemm.yaml |   0
 .../extended/big_tensor/biga.yaml             |   0
 .../extended/big_tensor/bigskinny_nt.yaml     |   0
 .../extended/big_tensor/largec.yaml           |   0
 .../rocblas_dgemm_bufferload_limit.yaml       |   0
 .../rocblas_sgemm_bufferload_limit.yaml       |   0
 .../extended/classic/test_persistent.yaml     |   0
 .../classic/test_tensor_contraction.yaml      |   0
 .../extended/classic_source/test_dgemm.yaml   |   0
 .../classic_source/test_hgemm_nn.yaml         |   0
 .../classic_source/test_hgemm_nt.yaml         |   0
 .../classic_source/test_hgemm_tn_tt.yaml      |   0
 .../extended/classic_source/test_sgemm.yaml   |   0
 .../YamlBuilder/YamlBuilder.py                |   0
 .../convolution_config/YamlBuilder/header.yml |   0
 .../YamlBuilder/solutions/sgemm_1.yml         |   0
 .../YamlBuilder/solutions/sgemm_src.yml       |   0
 .../extended/convolution_config/conftest.py   |   0
 .../test_backwarddata_nchw.py                 |   0
 .../test_backwardweights_nchw.py              |   0
 .../convolution_config/test_bad_input.py      |   0
 .../test_conv_vs_contraction.py               |   0
 .../convolution_config/test_forward_cnhw.py   |   0
 .../convolution_config/test_forward_nchw.py   |   0
 .../test_forward_nchw_ckyx.py                 |   0
 .../convolution_config/test_forward_nhwc.py   |   0
 .../convolution_config/test_forward_pad.py    |   0
 .../convolution_config/test_simple.py         |   0
 .../unittests/test_problem_sizes.py           |   0
 .../unittests/test_string_swap.py             |   0
 .../custom_kernel/ck_dgemm_90a_nn.yaml        |   0
 .../ck_dgemm_90a_nn_large_offset.yaml         |   0
 .../extended/direct_to_lds/dtl_dgemm.yaml     |   0
 .../extended/direct_to_lds/dtl_hgemm.yaml     |   0
 .../extended/direct_to_lds/dtl_sgemm.yaml     |   0
 .../extended/direct_to_lds/dtl_tsgr_f8.yaml   |   0
 .../direct_to_lds/dtl_tsgr_hgemm.yaml         |   0
 .../direct_to_lds/dtl_tsgr_sgemm.yaml         |   0
 .../extended/direct_to_vgpr/dtv_cgemm.yaml    |   0
 .../extended/direct_to_vgpr/dtv_dgemm.yaml    |   0
 .../direct_to_vgpr/dtv_dgemm_a1b0.yaml        |   0
 .../extended/direct_to_vgpr/dtv_f8gemm.yaml   |   0
 .../extended/direct_to_vgpr/dtv_hgemm.yaml    |   0
 .../extended/direct_to_vgpr/dtv_igemm.yaml    |   0
 .../extended/dot2/hgemm_hpa_dot2_nn.yaml      |   0
 .../extended/dot2/hgemm_hpa_dot2_tn.yaml      |   0
 .../extended/dot2/hgemm_hpa_dot2_tn_2.yaml    |   0
 .../extended/double_complex/zgemm_asm.yaml    |   0
 .../double_complex/zgemm_hip_source_cc.yaml   |   0
 .../double_complex/zgemm_hip_source_cn.yaml   |   0
 .../double_complex/zgemm_hip_source_ct.yaml   |   0
 .../double_complex/zgemm_hip_source_nc.yaml   |   0
 .../double_complex/zgemm_hip_source_nn.yaml   |   0
 .../double_complex/zgemm_hip_source_nt.yaml   |   0
 .../double_complex/zgemm_hip_source_tc.yaml   |   0
 .../double_complex/zgemm_hip_source_tn.yaml   |   0
 .../double_complex/zgemm_hip_source_tt.yaml   |   0
 .../extended/flat/test_dgemm_asm_flat.yaml    |   0
 .../extended/flat/test_sgemm_asm_flat.yaml    |   0
 .../extended/flat/test_sgemm_asm_flat_nt.yaml |   0
 .../extended/flat/test_sgemm_asm_flat_tn.yaml |   0
 .../extended/flat/test_sgemm_asm_flat_tt.yaml |   0
 .../extended/float8/f8gemm-hybrid-ss.yaml     |   0
 .../extended/float_complex/cgemm_asm.yaml     |   0
 .../float_complex/cgemm_hip_source_cc.yaml    |   0
 .../float_complex/cgemm_hip_source_cn.yaml    |   0
 .../float_complex/cgemm_hip_source_ct.yaml    |   0
 .../float_complex/cgemm_hip_source_nc.yaml    |   0
 .../float_complex/cgemm_hip_source_nn.yaml    |   0
 .../float_complex/cgemm_hip_source_nt.yaml    |   0
 .../float_complex/cgemm_hip_source_tc.yaml    |   0
 .../float_complex/cgemm_hip_source_tn.yaml    |   0
 .../float_complex/cgemm_hip_source_tt.yaml    |   0
 .../test_dgemm_fractional_tile_sweep.yaml     |   0
 .../test_hgemm_fractional_tile_sweep.yaml     |   0
 .../test_sgemm_fractional_edge.yaml           |   0
 .../test_sgemm_fractional_tile_sweep.yaml     |   0
 .../extended/global_split_u/hgemm_gsu.yaml    |   0
 .../global_split_u/hgemm_gsu_minkforgsu.yaml  |   0
 .../global_split_u/sgemm_gsu_batch.yaml       |   0
 .../global_split_u/sgemm_gsu_beta0.yaml       |   0
 .../global_split_u/sgemm_gsu_beta1.yaml       |   0
 .../global_split_u/sgemm_gsu_beta2.yaml       |   0
 .../global_split_u/sgemm_gsu_usebeta0.yaml    |   0
 .../hpa_source/test_hgemm_hpa_src_nn.yaml     |   0
 .../hpa_source/test_hgemm_hpa_src_nt.yaml     |   0
 .../hpa_source/test_hgemm_hpa_src_tn.yaml     |   0
 .../hpa_source/test_hgemm_hpa_src_tt.yaml     |   0
 .../local_split_u/bfloat16_lsu_mfma.yaml      |   0
 .../local_split_u/cgemm_lsu_mfma.yaml         |   0
 .../extended/local_split_u/dgemm_lsu.yaml     |   0
 .../local_split_u/dgemm_lsu_mfma.yaml         |   0
 .../local_split_u/f8gemm_lsu_mfma.yaml        |   0
 .../extended/local_split_u/hgemm_lsu.yaml     |   0
 .../local_split_u/hgemm_lsu_grvw2.yaml        |   0
 .../local_split_u/hgemm_lsu_mfma.yaml         |   0
 .../local_split_u/hgemm_lsu_mfma_a1b0.yaml    |   0
 .../local_split_u/igemm_lsu_mfma.yaml         |   0
 .../extended/local_split_u/sgemm_lsu.yaml     |   0
 .../local_split_u/sgemm_lsu_mfma.yaml         |   0
 .../local_split_u/zgemm_lsu_mfma.yaml         |   0
 .../mirror_dims/mirror_dims_1sum_zp.yaml      |   0
 .../mirror_dims_2sum_mir_summ.yaml            |   0
 .../mirror_dims_2sum_mir_summ_zp_other.yaml   |   0
 .../mirror_dims_2sum_mir_summ_zp_unroll.yaml  |   0
 .../mirror_dims_2sum_mir_unroll.yaml          |   0
 .../mirror_dims_2sum_mir_unroll_summ.yaml     |   0
 .../mirror_dims_2sum_mir_unroll_zp_other.yaml |   0
 ...mirror_dims_2sum_mir_unroll_zp_unroll.yaml |   0
 .../mirror_dims_3sum_mir_summ1.yaml           |   0
 .../mirror_dims_3sum_mir_summ1_summ2.yaml     |   0
 .../mirror_dims_3sum_mir_summ2.yaml           |   0
 .../mirror_dims_3sum_mir_summ_zp_other.yaml   |   0
 .../mirror_dims_3sum_mir_unroll.yaml          |   0
 .../mirror_dims_3sum_mir_unroll_summ1.yaml    |   0
 .../mirror_dims_3sum_mir_unroll_zp_other.yaml |   0
 .../extended/multi_sum/2sum.yaml              |   0
 .../extended/multi_sum/2sum_gsu.yaml          |   0
 .../extended/multi_sum/2sum_gsu_simple.yaml   |   0
 .../extended/multi_sum/2sum_gsu_src.yaml      |   0
 .../extended/multi_sum/2sum_src.yaml          |   0
 .../extended/multi_sum/3sum_gsu.yaml          |   0
 .../multi_sum/simple_sum2_scrambled.yaml      |   0
 .../multi_sum_psd/1sum_gsu_simple.yaml        |   0
 .../extended/multi_sum_psd/1sum_simple.yaml   |   0
 .../extended/multi_sum_psd/2sum.yaml          |   0
 .../extended/multi_sum_psd/2sum_gsu.yaml      |   0
 .../multi_sum_psd/2sum_gsu_simple.yaml        |   0
 .../multi_sum_psd/2sum_gsuremainder.yaml      |   0
 .../2sum_gsuremainder_simple.yaml             |   0
 .../extended/multi_sum_psd/2sum_pbd.yaml      |   0
 .../multi_sum_psd/2sum_scrambled_simple.yaml  |   0
 .../extended/multi_sum_psd/3sum.yaml          |   0
 .../extended/multi_sum_psd/3sum_gsu.yaml      |   0
 .../multi_sum_psd/3sum_gsu_simple.yaml        |   0
 .../extended/multi_sum_psd/3sum_simple.yaml   |   0
 .../extended/multi_sum_psd/README             |   0
 .../hackable_simple_unrollinc1.yaml           |   0
 .../extended/nonbatched/sgemm_asm_nn.yaml     |   0
 .../extended/nonbatched/sgemm_asm_nt.yaml     |   0
 .../extended/nonbatched/sgemm_asm_tn.yaml     |   0
 .../extended/nonbatched/sgemm_asm_tt.yaml     |   0
 .../pack_tensor_dims/multi_free2.yaml         |   0
 .../pack_tensor_dims/multi_free_batch.yaml    |   0
 .../pack_tensor_dims/packed_perf_nn.yaml      |   0
 .../simple_stridea0_pack.yaml                 |   0
 .../simple_strideb0_pack.yaml                 |   0
 .../pack_tensor_dims/strideb0_pack_nt.yaml    |   0
 .../pack_tensor_dims/strideb0_pack_tn.yaml    |   0
 .../pack_tensor_dims/vectorstore0.yaml        |   0
 .../extended/stagger_u/big_skinny_A_NN.yaml   |   0
 .../extended/stagger_u/big_skinny_A_NT.yaml   |   0
 .../extended/stagger_u/big_skinny_A_TN.yaml   |   0
 .../extended/stagger_u/big_skinny_A_TT.yaml   |   0
 .../extended/stagger_u/big_skinny_B_NN.yaml   |   0
 .../extended/stagger_u/big_skinny_B_NT.yaml   |   0
 .../extended/stagger_u/big_skinny_B_TN.yaml   |   0
 .../extended/stagger_u/big_skinny_B_TT.yaml   |   0
 .../extended/stream_k/sk_2tile_hgemm_hhs.yaml |   0
 .../extended/stream_k/sk_2tile_sgemm.yaml     |   0
 .../extended/stream_k/sk_hgemm_hhs.yaml       |   0
 .../extended/stream_k/sk_sgemm.yaml           |   0
 .../extended/tensor_contraction/README        |   0
 .../tensor_contraction/allownofree.yaml       |   0
 .../tensor_contraction/assert_size_equal.yaml |   0
 .../tensor_contraction/exact_conv.yaml        |   0
 .../extended/tensor_contraction/filter.yaml   |   0
 .../extended/tensor_contraction/ncdhw.yaml    |   0
 .../tensor_contraction/sweep_packed_dims.yaml |   0
 .../extended/tensor_contraction/swizzle0.yaml |   0
 .../extended/tensor_contraction/swizzle1.yaml |   0
 .../extended/tensor_contraction/swizzle2.yaml |   0
 .../extended/tensor_contraction/swizzle3.yaml |   0
 ...packed_strides3d_defaults.contraction.yaml |   0
 ...w_packed_strides_filter3d.contraction.yaml |   0
 .../test_nchw_filter_contraction.yaml         |   0
 .../tlu0_non_unit_stride.yaml                 |   0
 .../simple_use_initial_strides_1.yaml         |   0
 .../extended/use_initial_strides/test_1.yaml  |   0
 .../extended/use_initial_strides/test_2.yaml  |   0
 .../use_initial_strides/test_strides.yaml     |   0
 .../use_initial_strides/test_strides1.yaml    |   0
 .../perf_uis_cd_specialized.yaml              |   0
 .../test_use_initial_strides_cd_0.yaml        |   0
 .../test_use_initial_strides_cd_2.yaml        |   0
 .../extended/vector_width/hgemm_nn_asm.yaml   |   0
 .../extended/vector_width/sgemm_nn_asm.yaml   |   0
 .../vector_width/sgemm_nn_source.yaml         |   0
 .../zeropad/test_zp_2sum_zpother.yaml         |   0
 .../extended/zeropad/test_zp_simple_1sum.yaml |   0
 .../zeropad/test_zp_simple_2sum_zp_both.yaml  |   0
 .../zeropad/test_zp_simple_2sum_zp_other.yaml |   0
 .../test_zp_simple_2sum_zp_unroll.yaml        |   0
 .../zeropad/test_zp_simple_3sum_zp_other.yaml |   0
 .../hipModuleLoad_timing/Makefile             |   0
 .../hipModuleLoadTiming.cpp                   |   0
 .../integration/test_integration.py           |   0
 .../pre_checkin/4xi8gemm_hpa_hip_nn.yaml      |   0
 .../pre_checkin/4xi8gemm_hpa_hip_nt.yaml      |   0
 .../pre_checkin/4xi8gemm_hpa_hip_tn.yaml      |   0
 .../pre_checkin/4xi8gemm_hpa_hip_tt.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_nn.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_nt.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_tn.yaml      |   0
 .../bfloat16/bfloat16_hpa_source_tt.yaml      |   0
 .../bfloat16/bfloat16s_hpa_source_nn.yaml     |   0
 .../bfloat16/bfloat16s_hpa_source_nt.yaml     |   0
 .../bfloat16/bfloat16s_hpa_source_tn.yaml     |   0
 .../bfloat16/bfloat16s_hpa_source_tt.yaml     |   0
 .../Tests => Tests}/pre_checkin/cov/COV4.yaml |   0
 .../Tests => Tests}/pre_checkin/cov/COV5.yaml |   0
 .../pre_checkin/cov/COVDefault.yaml           |   0
 .../denorm/bfloat16_hpa_source_nn.yaml        |   0
 .../pre_checkin/denorm/dgemm_asm.yaml         |   0
 .../pre_checkin/denorm/hgemm_hpa_asm_nn.yaml  |   0
 .../denorm/mfma/bfloat16_1k_denorm.yaml       |   0
 .../denorm/mfma/bfloat16_denorm.yaml          |   0
 .../pre_checkin/denorm/mfma/dgemm_denorm.yaml |   0
 .../pre_checkin/denorm/mfma/hgemm_denorm.yaml |   0
 .../denorm/mfma/hgemm_denorm_alt.yaml         |   0
 .../denorm/mfma/hgemm_denorm_alt_rnz.yaml     |   0
 .../pre_checkin/denorm/mfma/sgemm_denorm.yaml |   0
 .../pre_checkin/denorm/sgemm_asm_nn.yaml      |   0
 .../pre_checkin/dgemm_asm.yaml                |   0
 .../pre_checkin/dgemm_general_batch_asm.yaml  |   0
 .../direct_to_vgpr/dtv_sgemm_lite.yaml        |   0
 .../double_complex/double_complex_asm_cc.yaml |   0
 .../double_complex/double_complex_asm_cn.yaml |   0
 .../double_complex/double_complex_asm_ct.yaml |   0
 .../double_complex/double_complex_asm_nc.yaml |   0
 .../double_complex/double_complex_asm_nn.yaml |   0
 .../double_complex/double_complex_asm_nt.yaml |   0
 .../double_complex/double_complex_asm_tc.yaml |   0
 .../double_complex/double_complex_asm_tn.yaml |   0
 .../double_complex/double_complex_asm_tt.yaml |   0
 .../double_complex/double_complex_hip_cc.yaml |   0
 .../double_complex/double_complex_hip_cn.yaml |   0
 .../double_complex/double_complex_hip_ct.yaml |   0
 .../double_complex/double_complex_hip_nc.yaml |   0
 .../double_complex/double_complex_hip_nn.yaml |   0
 .../double_complex/double_complex_hip_nt.yaml |   0
 .../double_complex/double_complex_hip_tc.yaml |   0
 .../double_complex/double_complex_hip_tn.yaml |   0
 .../double_complex/double_complex_hip_tt.yaml |   0
 .../float_complex/float_complex_asm_cc.yaml   |   0
 .../float_complex/float_complex_asm_cn.yaml   |   0
 .../float_complex/float_complex_asm_ct.yaml   |   0
 .../float_complex/float_complex_asm_nc.yaml   |   0
 .../float_complex/float_complex_asm_nn.yaml   |   0
 .../float_complex/float_complex_asm_nt.yaml   |   0
 .../float_complex/float_complex_asm_tc.yaml   |   0
 .../float_complex/float_complex_asm_tn.yaml   |   0
 .../float_complex/float_complex_asm_tt.yaml   |   0
 .../float_complex/float_complex_hip_cc.yaml   |   0
 .../float_complex/float_complex_hip_cn.yaml   |   0
 .../float_complex/float_complex_hip_ct.yaml   |   0
 .../float_complex/float_complex_hip_nc.yaml   |   0
 .../float_complex/float_complex_hip_nn.yaml   |   0
 .../float_complex/float_complex_hip_nt.yaml   |   0
 .../float_complex/float_complex_hip_tc.yaml   |   0
 .../float_complex/float_complex_hip_tn.yaml   |   0
 .../float_complex/float_complex_hip_tt.yaml   |   0
 .../pre_checkin/hgemm_asm_nn.yaml             |   0
 .../pre_checkin/hgemm_asm_nt.yaml             |   0
 .../pre_checkin/hgemm_asm_tn.yaml             |   0
 .../pre_checkin/hgemm_asm_tt.yaml             |   0
 .../hgemm_general_batch_asm_nn.yaml           |   0
 .../hgemm_general_batch_hpa_asm_nn.yaml       |   0
 .../hgemm_hpa_asm_f32_alphabeta_nn.yaml       |   0
 .../hgemm_hpa_asm_f32_alphabeta_nt.yaml       |   0
 .../hgemm_hpa_asm_f32_alphabeta_tn.yaml       |   0
 .../hgemm_hpa_asm_f32_alphabeta_tt.yaml       |   0
 .../pre_checkin/hgemm_hpa_asm_nn.yaml         |   0
 .../pre_checkin/hgemm_hpa_asm_nt.yaml         |   0
 .../pre_checkin/hgemm_hpa_asm_tn.yaml         |   0
 .../pre_checkin/hgemm_hpa_asm_tt.yaml         |   0
 .../pre_checkin/hgemm_hpa_iu2_asm_nn.yaml     |   0
 .../pre_checkin/hgemm_hpa_iu2_asm_nt.yaml     |   0
 .../pre_checkin/hgemm_hpa_iu2_asm_tn.yaml     |   0
 .../pre_checkin/hgemm_hpa_iu2_asm_tt.yaml     |   0
 .../pre_checkin/hsgemm_hpa_asm_nn.yaml        |   0
 .../pre_checkin/hsgemm_hpa_asm_nt.yaml        |   0
 .../pre_checkin/hsgemm_hpa_asm_tn.yaml        |   0
 .../pre_checkin/hsgemm_hpa_asm_tt.yaml        |   0
 .../pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml    |   0
 .../pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml    |   0
 .../pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml    |   0
 .../pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml    |   0
 .../pre_checkin/igemm_hpa_asm_nn.yaml         |   0
 .../pre_checkin/igemm_hpa_hip_nn.yaml         |   0
 .../pre_checkin/mfma/1LDSB.yaml               |   0
 .../pre_checkin/mfma/c-tile-reuse-no-nll.yaml |   0
 .../pre_checkin/mfma/cgemm_asm.yaml           |   0
 .../pre_checkin/mfma/cgemm_asm_conjugate.yaml |   0
 .../mfma/dgemm_alpha1_beta0_sgpr.yaml         |   0
 .../pre_checkin/mfma/dgemm_asm.yaml           |   0
 .../pre_checkin/mfma/dgemm_gb_global_ldd.yaml |   0
 .../pre_checkin/mfma/dgemm_large_offset.yaml  |   0
 .../mfma/hpa_bfloat16_gemm_asm.yaml           |   0
 .../mfma/hpa_bfloat16_gemm_asm_gfx940.yaml    |   0
 .../hpa_bfloat16_general_batch_gemm_asm.yaml  |   0
 ...float16_general_batch_gemm_asm_gfx940.yaml |   0
 .../mfma/hpa_bfloat16s_gemm_asm.yaml          |   0
 .../mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml   |   0
 .../pre_checkin/mfma/hpa_hgemm_asm.yaml       |   0
 .../mfma/hpa_hgemm_f32_alphabeta_asm.yaml     |   0
 .../mfma/hpa_hgemm_general_batch_asm.yaml     |   0
 .../pre_checkin/mfma/hpa_hgemm_split_lds.yaml |   0
 .../pre_checkin/mfma/hpa_hsgemm_asm.yaml      |   0
 .../pre_checkin/mfma/hpa_igemm_i8_asm.yaml    |   0
 .../mfma/hpa_igemm_i8_asm_gfx940.yaml         |   0
 .../mfma/hpa_igemm_i8_split_lds.yaml          |   0
 .../mfma/hpa_igemm_i8_split_lds_gfx940.yaml   |   0
 .../pre_checkin/mfma/sgemm_64bit_offset.yaml  |   0
 .../mfma/sgemm_64bit_offset_post.yaml         |   0
 .../pre_checkin/mfma/sgemm_asm.yaml           |   0
 .../mfma/sgemm_general_batch_asm.yaml         |   0
 .../pre_checkin/mfma/sgemm_split_lds.yaml     |   0
 .../mfma/sgemm_xf32_asm_gfx940.yaml           |   0
 .../pre_checkin/mfma/wider_local_read.yaml    |   0
 .../pre_checkin/mfma/zgemm_asm.yaml           |   0
 .../pre_checkin/mfma/zgemm_asm_conjugate.yaml |   0
 .../no_load_loop/nll_reproduce_bug.yaml       |   0
 .../no_load_loop/sgemm_nll_asm_nn.yaml        |   0
 .../no_load_loop/sgemm_nll_asm_nt.yaml        |   0
 .../no_load_loop/sgemm_nll_asm_tn.yaml        |   0
 .../no_load_loop/sgemm_nll_asm_tt.yaml        |   0
 .../regression/persistent_kernel.yaml         |   0
 .../pre_checkin/sgemm_asm_nn.yaml             |   0
 .../pre_checkin/sgemm_asm_nt.yaml             |   0
 .../pre_checkin/sgemm_asm_tn.yaml             |   0
 .../pre_checkin/sgemm_asm_tn_bigk.yaml        |   0
 .../pre_checkin/sgemm_asm_tt.yaml             |   0
 .../pre_checkin/sgemm_exact_dict.yaml         |   0
 .../sgemm_general_batch_asm_nn.yaml           |   0
 .../source/test_dgemm_defaults.yaml           |   0
 .../source/test_hgemm_defaults.yaml           |   0
 .../pre_checkin/source/test_hgemm_hpa.yaml    |   0
 .../source/test_sgemm_defaults.yaml           |   0
 .../pre_checkin/wmma/hgemm_wmma.yaml          |   0
 .../wmma/hpa_bfloat16_gemm_wmma.yaml          |   0
 .../pre_checkin/wmma/hpa_hgemm_wmma.yaml      |   0
 .../pre_checkin/wmma/hpa_igemm_wmma.yaml      |   0
 .../special/global_split_u_src/README         |   0
 .../special/global_split_u_src/hgemm_gsu.yaml |   0
 .../global_split_u_src/sgemm_gsu_beta0.yaml   |   0
 .../global_split_u_src/sgemm_gsu_beta1.yaml   |   0
 .../global_split_u_src/sgemm_gsu_beta2.yaml   |   0
 .../sgemm_gsu_usebeta0.yaml                   |   0
 .../special/igemm/igemm_hpa_hip_lsu.yaml      |   0
 .../special/igemm/igemm_hpa_hip_nn.yaml       |   0
 .../special/igemm/igemm_hpa_hip_tt.yaml       |   0
 .../library_data/hardcodedParameters.yaml     |   0
 .../initialSolutionParameters.yaml            |   0
 .../library/Kernels.so-000-gfx1010.hsaco      |   0
 .../library/Kernels.so-000-gfx1011.hsaco      |   0
 .../library/Kernels.so-000-gfx803.hsaco       |   0
 .../library/Kernels.so-000-gfx900.hsaco       |   0
 .../library/Kernels.so-000-gfx906.hsaco       |   0
 .../library/Kernels.so-000-gfx908.hsaco       |   0
 .../library_data/library/TensileLibrary.yaml  |   0
 .../library/TensileLibrary_gfx1010.co         |   0
 .../library/TensileLibrary_gfx1011.co         |   0
 .../library/TensileLibrary_gfx803.co          |   0
 .../library/TensileLibrary_gfx900.co          |   0
 .../library/TensileLibrary_gfx906.co          |   0
 .../library/TensileLibrary_gfx908.co          |   0
 .../unit/library_data/library/metadata.yaml   |   0
 .../unit/library_data/problemType.yaml        |   0
 .../unit/solutions/solutions_nn_3.yaml        |   0
 {Tensile/Tests => Tests}/unit/__init__.py     |   0
 .../unit/customKernels/TestKernel.s           |   0
 .../unit/replacement/bad_file/bad.txt         |   0
 .../unit/replacement/duplicate_kernel/a.txt   |   0
 .../unit/replacement/duplicate_kernel/b.txt   |   0
 .../replacement/known_kernels_v2/baz.s.txt    |   0
 .../known_kernels_v2/kernel_named_bar.txt     |   0
 .../known_kernels_v2/kernel_named_foo.txt     |   0
 .../replacement/known_kernels_v3/baz.s.txt    |   0
 .../known_kernels_v3/kernel_named_bar.txt     |   0
 .../known_kernels_v3/kernel_named_foo.txt     |   0
 {Tensile/Tests => Tests}/unit/test_Common.py  |   0
 .../Tests => Tests}/unit/test_Component.py    |   0
 .../unit/test_Configuration.py                |   0
 .../unit/test_CustomKernels.py                |   0
 .../Tests => Tests}/unit/test_DataType.py     |   0
 .../unit/test_HardwarePredicates.py           |   0
 .../unit/test_KernelWriterAssembly.py         |   0
 .../Tests => Tests}/unit/test_LibraryIO.py    |   0
 .../unit/test_PerfMetricPredicates.py         |   0
 .../Tests => Tests}/unit/test_Priority.py     |   0
 .../unit/test_ReplacementKernels.py           |   0
 .../unit/test_TensileCreateLibrary.py         |   0
 .../Tests => Tests}/unit/test_conv_problem.py |   0
 .../unit/test_exact_problem.py                |   0
 .../Tests => Tests}/unit/test_makeProblem.py  |   0
 .../Tests => Tests}/unit/test_mergeLogic.py   |   0
 .../Tests => Tests}/unit/test_tryAssembler.py |   0
 .../unit/test_useGlobalParameters.py          |   0
 .../vega_20/fast/igemm_asm_nn.yaml            |   0
 .../vega_20/fast/igemm_asm_nt.yaml            |   0
 .../vega_20/fast/igemm_asm_tn.yaml            |   0
 .../vega_20/fast/igemm_asm_tt.yaml            |   0
 .../global_split_u/igemm_gsu_beta0.yaml       |   0
 .../global_split_u/igemm_gsu_beta1.yaml       |   0
 .../global_split_u/igemm_gsu_beta2.yaml       |   0
 .../nightly/local_split_u/igemm_lsu.yaml      |   0
 .../Tests => Tests}/weekly/assertions/README  |   0
 .../assertions/test_hgemm_asem2_asm.yaml      |   0
 .../classic_source/test_hgemm_vectors.yaml    |   0
 .../classic_source/test_sgemm_vectors.yaml    |   0
 .../Tests => Tests}/yaml_only/test_config.py  |   0
 {Tensile/Tests => Tests}/yaml_only/test_ya    |   0
 .../Tensile}/AsmMemoryInstruction.py          |   0
 {Tensile => src/Tensile}/AsmRegisterPool.py   |   0
 {Tensile => src/Tensile}/AsmUtils.py          |   0
 {Tensile => src/Tensile}/BenchmarkProblems.py |   0
 {Tensile => src/Tensile}/BenchmarkSplitter.py |   0
 {Tensile => src/Tensile}/BenchmarkStructs.py  |   0
 {Tensile => src/Tensile}/ClientExecutable.py  |   0
 {Tensile => src/Tensile}/ClientWriter.py      |   0
 {Tensile => src/Tensile}/Code.py              |   0
 {Tensile => src/Tensile}/Common.py            |   0
 {Tensile => src/Tensile}/Component.py         |   0
 .../Tensile}/Components/ComputeStoreVgprs.py  |   0
 .../Tensile}/Components/LocalRead.py          |   0
 .../Tensile}/Components/LraTileAssignment.py  |   0
 .../Tensile}/Components/MAC_BF16_HPA.py       |   0
 .../Tensile}/Components/MAC_F16.py            |   0
 .../Tensile}/Components/MAC_F16_HPA.py        |   0
 .../Tensile}/Components/MAC_F32.py            |   0
 .../Tensile}/Components/MAC_F32C.py           |   0
 .../Tensile}/Components/MAC_F64.py            |   0
 .../Tensile}/Components/MAC_F64C.py           |   0
 .../Tensile}/Components/MAC_I8X4.py           |   0
 .../Tensile}/Components/MAC_I8_HPA.py         |   0
 {Tensile => src/Tensile}/Components/MFMA.py   |   0
 .../Components/NotLocalFullTileElements.py    |   0
 .../Tensile}/Components/Priority.py           |   0
 .../Components/PseudoRandomGenerator.py       |   0
 .../Components/ShiftVectorComponents.py       |   0
 .../Tensile}/Components/Signature.py          |   0
 .../Tensile}/Components/__init__.py           |   0
 {Tensile => src/Tensile}/Configuration.py     |   0
 {Tensile => src/Tensile}/Contractions.py      |   0
 {Tensile => src/Tensile}/CustomKernels.py     |   0
 ...128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s |   0
 {Tensile => src/Tensile}/DataType.py          |   0
 {Tensile => src/Tensile}/EmbeddedData.py      |   0
 .../Tensile}/GenerateSummations.py            |   0
 {Tensile => src/Tensile}/Hardware.py          |   0
 {Tensile => src/Tensile}/KernelWriter.py      |   0
 .../Tensile}/KernelWriterAssembly.py          |   0
 {Tensile => src/Tensile}/KernelWriterBase.py  |   0
 .../Tensile}/KernelWriterBetaOnly.py          |   0
 .../Tensile}/KernelWriterConversion.py        |   0
 .../Tensile}/KernelWriterSource.py            |   0
 .../Tensile}/KernelWriterStreamKInit.py       |   0
 {Tensile => src/Tensile}/LibraryIO.py         |   0
 {Tensile => src/Tensile}/LibraryLogic.py      |   0
 {Tensile => src/Tensile}/Parallel.py          |   0
 {Tensile => src/Tensile}/Properties.py        |   0
 .../Tensile}/ReplacementKernels.py            |   0
 {Tensile => src/Tensile}/SolutionLibrary.py   |   0
 .../Tensile}/SolutionSelectionLibrary.py      |   0
 {Tensile => src/Tensile}/SolutionStructs.py   |   0
 {Tensile => src/Tensile}/SolutionWriter.py    |   0
 {Tensile => src/Tensile}/Tensile.py           |   0
 .../Tensile}/TensileBenchmarkCluster.py       |   0
 .../TensileBenchmarkClusterScripts.py         |   0
 .../Tensile}/TensileBenchmarkLibraryClient.py |   0
 .../Tensile}/TensileClientConfig.py           |   0
 .../Tensile}/TensileCreateLibrary.py          |   0
 .../Tensile}/TensileLibLogicToYaml.py         |   0
 .../Tensile}/TensileMergeLibrary.py           |   0
 .../Tensile}/TensileRetuneLibrary.py          |   0
 .../Tensile}/TensileUpdateLibrary.py          |   0
 {Tensile => src/Tensile}/Utils.py             |   0
 {Tensile => src/Tensile}/__init__.py          |   0
 {Tensile => src/Tensile}/bin/Tensile          |   0
 .../Tensile}/bin/TensileBenchmarkCluster      |   0
 .../Tensile}/bin/TensileClientConfig          |   0
 .../Tensile}/bin/TensileCreateLibrary         |   0
 .../Tensile}/bin/TensileGenerateSummations    |   0
 .../Tensile}/bin/TensileLibLogicToYaml        |   0
 .../Tensile}/bin/TensileMergeLibrary          |   0
 .../Tensile}/bin/TensileRetuneLibrary         |   0
 .../Tensile}/bin/TensileUpdateLibrary         |   0
 .../alternate-format/sizeList-example.yaml    |   0
 .../alternate-format/vega20-example.yaml      |   0
 .../Tensile/data}/Configs/deep_bench_nn.csv   |   0
 .../data}/Configs/deep_bench_nn_batched.csv   |   0
 .../Tensile/data}/Configs/deep_bench_nt.csv   |   0
 .../data}/Configs/deep_bench_nt_batched.csv   |   0
 .../Tensile/data}/Configs/deep_bench_tn.csv   |   0
 .../data}/Configs/deep_bench_tn_batched.csv   |   0
 .../Configs/mfma/mfma_hpa_bf16_nt_test.yaml   |   0
 .../Configs/mfma/mfma_igemm_lite_test.yaml    |   0
 .../Configs/mfma/mfma_igemm_nn_asm_full.yaml  |   0
 .../Configs/mfma/mfma_igemm_nt_asm_full.yaml  |   0
 .../Configs/mfma/mfma_igemm_tn_asm_full.yaml  |   0
 .../Configs/mfma/mfma_igemm_tt_asm_full.yaml  |   0
 .../Tensile/data}/Configs/mfma/mfma_test.yaml |   0
 .../mfma/rocblas_cgemm_asm_xdlops.yaml        |   0
 .../mfma/rocblas_sgemm_asm_single_kernel.yaml |   0
 .../mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml  |   0
 .../data}/Configs/mfma/sgemm_tlunn.yaml       |   0
 .../Configs/mfma/sgemm_transposeLDS.yaml      |   0
 .../vega10_Cijk_Ailk_Bljk_HB.yaml             |   0
 .../vega10_Cijk_Ailk_Bljk_SB.yaml             |   0
 .../vega10_Cijk_Ailk_Bjlk_HB.yaml             |   0
 .../vega10_Cijk_Ailk_Bjlk_SB.yaml             |   0
 .../vega10_Cijk_Ailk_Bljk_HB.yaml             |   0
 .../vega10_Cijk_Ailk_Bljk_SB.yaml             |   0
 .../vega10_Cijk_Alik_Bljk_HB.yaml             |   0
 .../vega10_Cijk_Alik_Bljk_SB.yaml             |   0
 .../Tensile/data}/Configs/miopen/Makefile     |   0
 .../Tensile/data}/Configs/miopen/README.md    |   0
 .../configs/vega20_sgemm_nn_bert.yaml         |   0
 .../configs/vega20_sgemm_nt_bert.yaml         |   0
 .../configs/vega20_sgemm_tn_bert.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_bert.yaml         |   0
 .../configs/vega20_sgemm_nt_bert.yaml         |   0
 .../configs/vega20_sgemm_tn_bert.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/arcturus_sgemm_nn_bert.yaml       |   0
 .../configs/arcturus_sgemm_nt_bert.yaml       |   0
 .../configs/arcturus_sgemm_tn_bert.yaml       |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../configs/vega20_sgemm_nn_msra.yaml         |   0
 .../configs/vega20_sgemm_nt_msra.yaml         |   0
 .../configs/vega20_sgemm_tn_msra.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_bert.yaml         |   0
 .../configs/vega20_sgemm_nt_bert.yaml         |   0
 .../configs/vega20_sgemm_tn_bert.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_hgemm_nn_bert_f16.yaml     |   0
 .../configs/vega20_hgemm_nt_bert_f16.yaml     |   0
 .../configs/vega20_hgemm_tn_bert_f16.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_HB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_HB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_HB.yaml       |   0
 .../configs/bert_sgemm_xdlops_nn.yaml         |   0
 .../configs/bert_sgemm_xdlops_tn.yaml         |   0
 .../2020-05-18/configs/dlrm_sgemm_xdlops.yaml |   0
 .../configs/dlrm_sgemm_xdlops_nt.yaml         |   0
 .../replacement-kernel-arcturus-tn.yaml       |   0
 .../rocblas_sgemm_nn_inc1_asm_full.yaml       |   0
 .../rocblas_sgemm_nt_inc1_asm_full.yaml       |   0
 .../rocblas_sgemm_tn_inc1_asm_full.yaml       |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../configs/vega20_sgemm_nn_batched_msra.yaml |   0
 .../configs/vega20_sgemm_nt_batched_msra.yaml |   0
 .../configs/vega20_sgemm_tn_batched_msra.yaml |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_onnx.yaml         |   0
 .../configs/vega20_sgemm_nt_onnx.yaml         |   0
 .../configs/vega20_sgemm_tn_onnx.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_hgemm_nn_megatron.yaml     |   0
 .../configs/vega20_hgemm_nt_megatron.yaml     |   0
 .../configs/vega20_hgemm_tn_megatron.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml      |   0
 .../archives/bert/2020-11-06/configs/doit.sh  |   0
 .../archives/bert/2020-11-06/configs/nn.yaml  |   0
 .../archives/bert/2020-11-06/configs/nt.yaml  |   0
 .../archives/bert/2020-11-06/configs/tn.yaml  |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../bert/2020-11-08/configs/bert-nn.yaml      |   0
 .../bert/2020-11-08/configs/bert-nt.yaml      |   0
 .../bert/2020-11-08/configs/bert-tn.yaml      |   0
 .../archives/bert/2020-11-08/configs/doit.sh  |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../configs/vega20_sgemm_nn_dlrm.yaml         |   0
 .../configs/vega20_sgemm_nt_dlrm.yaml         |   0
 .../configs/vega20_sgemm_tn_dlrm.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/arcturus_sgemm_nn_dlrm.yaml       |   0
 .../configs/arcturus_sgemm_nt_dlrm.yaml       |   0
 .../configs/arcturus_sgemm_tn_dlrm.yaml       |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../dlrm/2020-07-02/configs/temp.yaml         |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../configs/sgemm_xdlops_nn_terabyte.yaml     |   0
 .../configs/sgemm_xdlops_nt_terabyte.yaml     |   0
 .../configs/sgemm_xdlops_tn_terabyte.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 ...urus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml |   0
 ...urus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml |   0
 ...urus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../miopen/archives/ext2/2020-11-05/README    |   0
 .../clients/samples/example_gemm_ext2-tn.cpp  |   0
 .../ext2/2020-11-05/gfx900/configs/doit.sh    |   0
 .../gfx900/configs/spec2-nn-gfx900.yaml       |   0
 .../gfx900/configs/spec2-tn-gfx900.yaml       |   0
 .../gfx900/configs/speccd-nn-gfx900.yaml      |   0
 .../gfx900/configs/speccd-tn-gfx900.yaml      |   0
 .../joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../ext2/2020-11-05/gfx906/configs/doit.sh    |   0
 .../gfx906/configs/spec2-nn-gfx906.yaml       |   0
 .../gfx906/configs/spec2-tn-gfx906.yaml       |   0
 .../gfx906/configs/speccd-nn-gfx906.yaml      |   0
 .../gfx906/configs/speccd-tn-gfx906.yaml      |   0
 .../joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml   |   0
 .../raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml    |   0
 .../ext2/2020-11-05/gfx908/configs/doit.sh    |   0
 .../gfx908/configs/spec2-nn-gfx908.yaml       |   0
 .../gfx908/configs/spec2-tn-gfx908.yaml       |   0
 .../gfx908/configs/speccd-nn-gfx908.yaml      |   0
 .../gfx908/configs/speccd-tn-gfx908.yaml      |   0
 .../joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml |   0
 .../joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml  |   0
 .../raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml |   0
 .../raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml  |   0
 .../raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml |   0
 .../raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml  |   0
 .../configs/sgemm_inception_nn.yaml           |   0
 .../configs/sgemm_inception_nt_batched.yaml   |   0
 .../configs/sgemm_inception_tn.yaml           |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_riga.yaml         |   0
 .../configs/vega20_sgemm_nt_riga.yaml         |   0
 .../configs/vega20_sgemm_tn_riga.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../arcturus_sgemm_nn_resnext-inception.yaml  |   0
 .../arcturus_sgemm_nt_resnext-inception.yaml  |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../2021-02-04/2_BenchmarkData.tar.gz         | Bin
 .../configs/vega20_hgemm_nn_hbh.yaml          |   0
 .../configs/vega20_hgemm_nt_hbh.yaml          |   0
 .../configs/vega20_hgemm_tn_hbh.yaml          |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml      |   0
 .../configs/vega20_sgemm_nn_mlp.yaml          |   0
 .../configs/vega20_sgemm_nt_mlp.yaml          |   0
 .../configs/vega20_sgemm_tn_mlp.yaml          |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_k1.yaml           |   0
 .../configs/vega20_sgemm_nt_k1.yaml           |   0
 .../configs/vega20_sgemm_tn_k1.yaml           |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../archive/vega20_Cijk_Alik_Bljk_SB.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../archive/vega20_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../archive/vega20_Cijk_Alik_Bljk_SB.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs1/vega20_sgemm_nn_phantom.yaml     |   0
 .../configs1/vega20_sgemm_tn_phantom.yaml     |   0
 .../configs2/vega20_sgemm_nn_phantom.yaml     |   0
 .../configs2/vega20_sgemm_nt_phantom.yaml     |   0
 .../configs2/vega20_sgemm_tn_phantom.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_riga.yaml         |   0
 .../configs/vega20_sgemm_nt_riga.yaml         |   0
 .../configs/vega20_sgemm_tn_riga.yaml         |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/resnet-inception-nn-2x2.yaml      |   0
 .../configs/resnet-inception-nn.yaml          |   0
 .../configs/resnet-inception-nt-2x2.yaml      |   0
 .../configs/resnet-inception-nt.yaml          |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_S.yaml        |   0
 .../exact/vega20_Cijk_Ailk_Bljk_S.yaml        |   0
 .../exact/vega20_Cijkl_Aijml_Bkml_SI.yaml     |   0
 .../exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml     |   0
 .../configs/resnet-inception-hgemm-nn.yaml    |   0
 .../configs/resnet-inception-hgemm-nt.yaml    |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_HH.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_HH.yaml       |   0
 .../arcturus_sgemm_nn_resnext-inception.yaml  |   0
 .../arcturus_sgemm_nt_resnext-inception.yaml  |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../archives/resnet50/2018-09-12/README.md    |   0
 .../2018-09-12/config/hgemm_resnet50_nn.yaml  |   0
 .../2018-09-12/config/hgemm_resnet50_nt.yaml  |   0
 .../2018-09-12/config/hgemm_resnet50_tn.yaml  |   0
 .../2018-09-12/config/sgemm_resnet50_nn.yaml  |   0
 .../2018-09-12/config/sgemm_resnet50_nt.yaml  |   0
 .../2018-09-12/config/sgemm_resnet50_tn.yaml  |   0
 .../logic/vega20_Cijk_Ailk_Bjlk_HB.yaml       |   0
 .../logic/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../logic/vega20_Cijk_Ailk_Bljk_HB.yaml       |   0
 .../logic/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../logic/vega20_Cijk_Alik_Bljk_HB.yaml       |   0
 .../logic/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../archives/resnet50/2018-10-09/README.md    |   0
 .../2018-10-09/config/hgemm_resnet50_nn.yaml  |   0
 .../2018-10-09/config/hgemm_resnet50_nt.yaml  |   0
 .../2018-10-09/config/hgemm_resnet50_tn.yaml  |   0
 .../2018-10-09/config/hpa_resnet50_nn.yaml    |   0
 .../2018-10-09/config/hpa_resnet50_nt.yaml    |   0
 .../2018-10-09/config/hpa_resnet50_tn.yaml    |   0
 .../2018-10-09/config/sgemm_resnet50_nn.yaml  |   0
 .../2018-10-09/config/sgemm_resnet50_nt.yaml  |   0
 .../2018-10-09/config/sgemm_resnet50_tn.yaml  |   0
 .../logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml  |   0
 .../logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml |   0
 .../logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml  |   0
 .../logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml  |   0
 .../logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml |   0
 .../logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml  |   0
 .../logic/main/vega20_Cijk_Alik_Bljk_HB.yaml  |   0
 .../logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml |   0
 .../logic/main/vega20_Cijk_Alik_Bljk_SB.yaml  |   0
 .../merged/vega20_Cijk_Ailk_Bjlk_HB.yaml      |   0
 .../merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml     |   0
 .../merged/vega20_Cijk_Ailk_Bjlk_SB.yaml      |   0
 .../merged/vega20_Cijk_Ailk_Bljk_HB.yaml      |   0
 .../merged/vega20_Cijk_Ailk_Bljk_HBH.yaml     |   0
 .../merged/vega20_Cijk_Ailk_Bljk_SB.yaml      |   0
 .../merged/vega20_Cijk_Alik_Bljk_HB.yaml      |   0
 .../merged/vega20_Cijk_Alik_Bljk_HBH.yaml     |   0
 .../merged/vega20_Cijk_Alik_Bljk_SB.yaml      |   0
 .../resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml    |   0
 .../resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml   |   0
 .../resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml    |   0
 .../resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml    |   0
 .../resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml   |   0
 .../resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml    |   0
 .../resnet50/vega20_Cijk_Alik_Bljk_HB.yaml    |   0
 .../resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml   |   0
 .../resnet50/vega20_Cijk_Alik_Bljk_SB.yaml    |   0
 .../configs/vega20_sgemm_nn_resnet50.yaml     |   0
 .../configs/vega20_sgemm_nt_resnet50.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../2021-02-10/2_BenchmarkData.tar.gz         | Bin
 .../configs/arcturus_sgemm_nn_sb.yaml         |   0
 .../configs/arcturus_sgemm_nt_sb.yaml         |   0
 .../configs/arcturus_sgemm_tn_sb.yaml         |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../2021-02-17/2_BenchmarkData.tar.gz         | Bin
 .../configs/vega20_sgemm_nn_resnext3d.yaml    |   0
 .../configs/vega20_sgemm_nt_resnext3d.yaml    |   0
 .../configs/vega20_sgemm_tn_resnext3d.yaml    |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../2021-02-18/2_BenchmarkData.tar.gz         | Bin
 .../configs/vega20_sgemm_nn_resnext3d-r2.yaml |   0
 .../configs/vega20_sgemm_nt_resnext3d-r2.yaml |   0
 .../configs/vega20_sgemm_tn_resnext3d-r2.yaml |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../replacement-kernel-arcturus-tn.yaml       |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../base/arcturus_Cijk_Alik_Bljk_SB.yaml      |   0
 .../combined/arcturus_Cijk_Alik_Bljk_SB.yaml  |   0
 .../configuration/sgemm_tn-guard-pr195.yaml   |   0
 .../inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml   |   0
 .../inc/arcturus_Cijk_Alik_Bljk_SB.yaml       |   0
 .../archives/rk/2020-08-12/logs/convert.log   |   0
 .../archives/rk/2020-08-12/logs/merge.log     |   0
 .../2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml  |   0
 .../2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml  |   0
 .../2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml  |   0
 .../configs/vega20_sgemm_nn_shakespeare.yaml  |   0
 .../configs/vega20_sgemm_nt_shakespeare.yaml  |   0
 .../configs/vega20_sgemm_tn_shakespeare.yaml  |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega10_sgemm_nn_shakespeare.yaml  |   0
 .../configs/vega10_sgemm_nt_shakespeare.yaml  |   0
 .../configs/vega10_sgemm_tn_shakespeare.yaml  |   0
 .../exact/vega10_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega10_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega10_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/arcturus_sgemm_tn_miopen.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../arcturus_dgemm_nn_skinny_small.yaml       |   0
 .../arcturus_dgemm_nt_skinny_small.yaml       |   0
 .../vegoa20_dgemm_nn_skinny_small.yaml        |   0
 .../vegoa20_dgemm_nt_skinny_small.yaml        |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_DB.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_DB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_DB.yaml       |   0
 .../arcturus_dgemm_nn_skinny_large.yaml       |   0
 .../configs/vega20_dgemm_nn_skinny_large.yaml |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_DB.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bljk_DB.yaml       |   0
 .../2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml  |   0
 .../archive/vega20_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml  |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../configs/sgemm_sparseNN_gemm_nn.yaml       |   0
 .../configs/sgemm_sparseNN_gemm_tn.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega10_sgemm_nn_transformer.yaml  |   0
 .../configs/vega10_sgemm_nt_transformer.yaml  |   0
 .../configs/vega10_sgemm_tn_transformer.yaml  |   0
 .../exact/vega10_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega10_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega10_Cijk_Alik_Bljk_SB.yaml       |   0
 .../configs/vega20_sgemm_nn_transformer.yaml  |   0
 .../configs/vega20_sgemm_nt_transformer.yaml  |   0
 .../configs/vega20_sgemm_tn_transformer.yaml  |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../arcturus_sgemm_nn_transformer.yaml        |   0
 .../arcturus_sgemm_nt_transformer.yaml        |   0
 .../arcturus_sgemm_tn_transformer.yaml        |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../arcturus_sgemm_nn_transformer.yaml        |   0
 .../arcturus_sgemm_nt_transformer.yaml        |   0
 .../arcturus_sgemm_tn_transformer.yaml        |   0
 .../exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Ailk_Bljk_SB.yaml     |   0
 .../exact/arcturus_Cijk_Alik_Bljk_SB.yaml     |   0
 .../vega20_sgemm_nn_sgemm_transformer.yaml    |   0
 .../vega20_sgemm_nt_sgemm_transformer.yaml    |   0
 .../vega20_sgemm_tn_sgemm_transformer.yaml    |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../exact/vega20_Cijk_Ailk_Bljk_SB.yaml       |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../vega20_hgemm_nn_hgemm_transformer.yaml    |   0
 .../vega20_hgemm_nt_hgemm_transformer.yaml    |   0
 .../vega20_hgemm_tn_hgemm_transformer.yaml    |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Ailk_Bljk_HBH.yaml      |   0
 .../exact/vega20_Cijk_Alik_Bljk_HBH.yaml      |   0
 .../configs/vega20_sgemm_nt_winograd.yaml     |   0
 .../exact/vega20_Cijk_Ailk_Bjlk_SB.yaml       |   0
 .../configs/vega20_sgemm_tn_winograd.yaml     |   0
 .../exact/vega20_Cijk_Alik_Bljk_SB.yaml       |   0
 .../data}/Configs/miopen/boiler/header.yml    |   0
 .../miopen/boiler/library_logic_hip_only.yml  |   0
 .../boiler/library_logic_vega10_only.yml      |   0
 .../boiler/library_logic_vega20_only.yml      |   0
 .../data}/Configs/miopen/convert_cfg.py       |   0
 .../Tensile/data}/Configs/miopen/make_all.sh  |   0
 .../problems/nn/deepbench_conv_1x1_batch1.yml |   0
 .../problems/nn/deepbench_conv_1x1_batchN.yml |   0
 .../problems/nn/deepbench_gemm_large.yml      |   0
 .../problems/nn/deepbench_gemm_skinny.yml     |   0
 .../miopen/problems/nn/resnet50_all.yml       |   0
 .../miopen/problems/nn/resnet50_batch64.yml   |   0
 .../miopen/problems/nn/resnet_batch64_B.yml   |   0
 .../problems/nt/deepbench_gemm_large.yml      |   0
 .../problems/nt/deepbench_gemm_skinny.yml     |   0
 .../miopen/problems/nt/resnet50_all.yml       |   0
 .../problems/tn/deepbench_gemm_large.yml      |   0
 .../problems/tn/deepbench_gemm_skinny.yml     |   0
 .../miopen/problems/tn/resnet50_all.yml       |   0
 .../solutions/hgemm_large_explore_3.yml       |   0
 .../solutions/hgemm_large_explore_5.yml       |   0
 .../Configs/miopen/solutions/hgemm_quick.yml  |   0
 .../solutions/hgemm_skinny_explore_3.yml      |   0
 .../solutions/hgemm_skinny_explore_5.yml      |   0
 .../solutions/sgemm_large_explore_3.yml       |   0
 .../solutions/sgemm_large_explore_5.yml       |   0
 .../solutions/sgemm_large_explore_7.yml       |   0
 .../Configs/miopen/solutions/sgemm_quick.yml  |   0
 .../solutions/sgemm_skinny_explore_3.yml      |   0
 .../solutions/sgemm_skinny_explore_4.yml      |   0
 .../solutions/sgemm_skinny_explore_5.yml      |   0
 .../solutions/sgemm_skinny_explore_7.yml      |   0
 .../data}/Configs/miopen/types/hgemm_nn.yml   |   0
 .../data}/Configs/miopen/types/hgemm_nt.yml   |   0
 .../data}/Configs/miopen/types/hgemm_tn.yml   |   0
 .../data}/Configs/miopen/types/hgemm_tt.yml   |   0
 .../data}/Configs/miopen/types/igemm_nn.yml   |   0
 .../data}/Configs/miopen/types/igemm_nt.yml   |   0
 .../data}/Configs/miopen/types/igemm_tn.yml   |   0
 .../data}/Configs/miopen/types/igemm_tt.yml   |   0
 .../data}/Configs/miopen/types/sgemm_nn.yml   |   0
 .../data}/Configs/miopen/types/sgemm_nt.yml   |   0
 .../data}/Configs/miopen/types/sgemm_tn.yml   |   0
 .../data}/Configs/miopen/types/sgemm_tt.yml   |   0
 .../navi21/rocblas_hgemm_gb_nn_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_gb_nt_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_gb_tn_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_gb_tt_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_sb_nn_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_sb_nt_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_sb_tn_asm_full.yaml  |   0
 .../navi21/rocblas_hgemm_sb_tt_asm_full.yaml  |   0
 .../rocblas_hpa_hgemm_gb_nn_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_gb_nt_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_gb_tn_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_gb_tt_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_sb_nn_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_sb_nt_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_sb_tn_asm_full.yaml     |   0
 .../rocblas_hpa_hgemm_sb_tt_asm_full.yaml     |   0
 .../navi21/rocblas_sgemm_gb_nn_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_gb_nt_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_gb_tn_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_gb_tt_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_sb_nn_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_sb_nt_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_sb_tn_asm_full.yaml  |   0
 .../navi21/rocblas_sgemm_sb_tt_asm_full.yaml  |   0
 .../Tensile/data}/Configs/rocblas_cgemm.yaml  |   0
 .../data}/Configs/rocblas_cgemm_asm_lite.yaml |   0
 .../data}/Configs/rocblas_cgemm_hip_lite.yaml |   0
 .../data}/Configs/rocblas_dgemm_asm_lite.yaml |   0
 .../rocblas_dgemm_asm_single_kernel.yaml      |   0
 .../Configs/rocblas_dgemm_asm_square.yaml     |   0
 .../rocblas_dgemm_bufferload_limit.yaml       |   0
 .../data}/Configs/rocblas_dgemm_hip_lite.yaml |   0
 .../Configs/rocblas_dgemm_nn_asm_full.yaml    |   0
 .../rocblas_dgemm_nn_inc0_asm_full.yaml       |   0
 .../Configs/rocblas_dgemm_nt_asm_full.yaml    |   0
 .../rocblas_dgemm_nt_inc0_asm_full.yaml       |   0
 .../rocblas_dgemm_nt_inc1_asm_full.yaml       |   0
 .../rocblas_dgemm_nt_inc2_asm_full.yaml       |   0
 .../rocblas_dgemm_nt_inc3_asm_full.yaml       |   0
 .../rocblas_dgemm_nt_resume_train_exp.yaml    |   0
 .../Configs/rocblas_dgemm_tn_asm_full.yaml    |   0
 .../Configs/rocblas_dgemm_tt_asm_full.yaml    |   0
 .../data}/Configs/rocblas_hgemm_asm_full.yaml |   0
 .../data}/Configs/rocblas_hgemm_asm_lite.yaml |   0
 .../rocblas_hgemm_asm_single_kernel.yaml      |   0
 .../rocblas_hgemm_bufferload_limit.yaml       |   0
 .../data}/Configs/rocblas_hgemm_hip_lite.yaml |   0
 .../rocblas_hpa_bf16_gemm_tn_asm_test.yaml    |   0
 .../rocblas_hpa_bf16s_gemm_tn_asm_test.yaml   |   0
 .../rocblas_hpa_bfloat16_gemm_inc1_hip.yaml   |   0
 ...as_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml |   0
 ...as_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml |   0
 ...as_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml |   0
 .../rocblas_hpa_bfloat16_hip_lite.yaml        |   0
 ...ocblas_hpa_bfloat16_hip_single_kernel.yaml |   0
 ...rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml |   0
 ...rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml |   0
 .../rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml  |   0
 ...s_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml |   0
 ...s_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml |   0
 ...s_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml |   0
 .../rocblas_hpa_bfloat16s_hip_lite.yaml       |   0
 ...cblas_hpa_bfloat16s_hip_single_kernel.yaml |   0
 ...ocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml |   0
 ...ocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml |   0
 .../Configs/rocblas_hpa_hgemm_asm_lite.yaml   |   0
 .../rocblas_hpa_hgemm_asm_single_kernel.yaml  |   0
 .../Configs/rocblas_hpa_hgemm_hip_lite.yaml   |   0
 .../Configs/rocblas_hpa_hgemm_inc1_hip.yaml   |   0
 .../rocblas_hpa_hgemm_nn_asm_full.yaml        |   0
 .../rocblas_hpa_hgemm_nn_inc1_asm_full.yaml   |   0
 .../rocblas_hpa_hgemm_nt_asm_full.yaml        |   0
 .../rocblas_hpa_hgemm_nt_inc1_asm_full.yaml   |   0
 .../rocblas_hpa_hgemm_tn_asm_full.yaml        |   0
 .../rocblas_hpa_hgemm_tn_inc1_asm_full.yaml   |   0
 .../rocblas_hpa_hgemm_tt_asm_full.yaml        |   0
 .../Configs/rocblas_hpa_hsgemm_asm_lite.yaml  |   0
 .../rocblas_hpa_hsgemm_asm_single_kernel.yaml |   0
 .../Configs/rocblas_hpa_hsgemm_hip_lite.yaml  |   0
 .../Configs/rocblas_hpa_hsgemm_inc1_hip.yaml  |   0
 .../rocblas_hpa_hsgemm_nn_asm_full.yaml       |   0
 .../rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml  |   0
 .../rocblas_hpa_hsgemm_nt_asm_full.yaml       |   0
 .../rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml  |   0
 .../rocblas_hpa_hsgemm_tn_asm_full.yaml       |   0
 .../rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml  |   0
 .../rocblas_hpa_hsgemm_tt_asm_full.yaml       |   0
 .../Configs/rocblas_hpa_igemm_nn_hip.yaml     |   0
 .../Configs/rocblas_hpa_igemm_nt_hip.yaml     |   0
 .../Configs/rocblas_hpa_igemm_tn_hip.yaml     |   0
 .../Configs/rocblas_hpa_igemm_tt_hip.yaml     |   0
 .../Configs/rocblas_hsgemm_asm_lite.yaml      |   0
 .../Configs/rocblas_igemm_asm_full_nn.yaml    |   0
 .../Configs/rocblas_igemm_asm_full_nt.yaml    |   0
 .../Configs/rocblas_igemm_asm_full_tn.yaml    |   0
 .../Configs/rocblas_igemm_asm_full_tt.yaml    |   0
 .../rocblas_igemm_hip_single_kernel.yaml      |   0
 .../data}/Configs/rocblas_sgemm_asm_full.yaml |   0
 .../data}/Configs/rocblas_sgemm_asm_lite.yaml |   0
 .../data}/Configs/rocblas_sgemm_asm_only.yaml |   0
 .../rocblas_sgemm_asm_single_kernel.yaml      |   0
 .../rocblas_sgemm_bufferload_limit.yaml       |   0
 .../data}/Configs/rocblas_sgemm_example.yaml  |   0
 .../data}/Configs/rocblas_sgemm_hip_lite.yaml |   0
 .../rocblas_sgemm_nn_inc1_asm_full.yaml       |   0
 .../rocblas_sgemm_nt_inc1_asm_full.yaml       |   0
 .../rocblas_sgemm_tn_inc1_asm_full.yaml       |   0
 .../rocblas_sgemm_tn_inc2_asm_full.yaml       |   0
 .../rocblas_sgemm_tn_inc3_asm_full.yaml       |   0
 .../Tensile/data}/Configs/rocblas_zgemm.yaml  |   0
 .../data}/Configs/rocblas_zgemm_asm_lite.yaml |   0
 .../Tensile/data}/Perf/BDAS/dgemm_kmeans.yaml |   0
 .../Tensile/data}/Perf/BDAS/dgemm_pca.yaml    |   0
 .../Tensile/data}/Perf/BERT/sgemm_xdlops.yaml |   0
 .../Tensile/data}/Perf/DLRM/sgemm_xdlops.yaml |   0
 .../data}/Perf/DLRM/sgemm_xdlops_nn.yaml      |   0
 .../Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml   |   0
 .../data}/Perf/DLRM/sgemm_xdlops_nt.yaml      |   0
 .../Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml   |   0
 .../Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml   |   0
 .../data}/Perf/TRANSFORMER/sgemm_xdlops.yaml  |   0
 .../Perf/TRANSFORMER/sgemm_xdlops_nn.yaml     |   0
 .../Perf/TRANSFORMER/sgemm_xdlops_nt.yaml     |   0
 .../Tensile/data}/Perf/conv/README            |   0
 .../data}/Perf/conv/conv_1x1_af0em.yaml       |   0
 .../data}/Perf/conv/conv_1x1_oddpbd.yaml      |   0
 .../data}/Perf/conv/conv_1x1u2_bdww.yaml      |   0
 .../data}/Perf/conv/conv_1x1u2_fwd.yaml       |   0
 .../Tensile/data}/Perf/conv/conv_1x7_fwd.yaml |   0
 .../Tensile/data}/Perf/conv/conv_7x1_fwd.yaml |   0
 .../data}/Perf/conv/conv_7x1_fwd2.yaml        |   0
 .../data}/Perf/conv/conv_7x1_roundup.yaml     |   0
 .../data}/Perf/conv/conv_7x7u2_fwd.yaml       |   0
 .../data}/Perf/conv/conv_bwdd_pbd.yaml        |   0
 .../Tensile/data}/Perf/conv/conv_fwd.yaml     |   0
 .../Tensile/data}/Perf/conv_bwdd_ex0.yaml     |   0
 .../Tensile/data}/Perf/conv_bwdd_ex1.yaml     |   0
 .../Tensile/data}/Perf/conv_bwdw_big_gsu.yaml |   0
 .../data}/Perf/conv_bwdw_small_gsu.yaml       |   0
 .../Tensile/data}/Perf/conv_fwd_ex0.yaml      |   0
 .../data}/Perf/dgemm_large_square.yaml        |   0
 {Tensile => src/Tensile/data}/Perf/hpl.yaml   |   0
 .../Tensile/data}/Perf/hpl_one.yaml           |   0
 .../Tensile/data}/Perf/hpl_quick.yaml         |   0
 .../Tensile/data}/Perf/hpl_quick44k.yaml      |   0
 .../data}/Perf/inception/conv_1x1u1.yaml      |   0
 .../Perf/inception/conv_1x1u1_starter.yaml    |   0
 .../data}/Perf/inception/conv_NxN.yaml        |   0
 .../data}/Perf/sgemm_large_square_nn.yaml     |   0
 .../data}/Perf/sgemm_large_square_nt.yaml     |   0
 .../data}/Perf/sgemm_large_square_tn.yaml     |   0
 .../data}/Perf/use_initial_strides_cd/README  |   0
 .../perf_baseline0.yaml                       |   0
 .../use_initial_strides_cd/perf_uis_cd0.yaml  |   0
 .../perf_uis_cd_specialized.yaml              |   0
 .../Tensile/data}/Source/CMakeLists.txt       |   0
 .../Tensile/data}/Source/EnableWarnings.cmake |   0
 .../Tensile/data}/Source/FindHIP.cmake        |   0
 .../Tensile/data}/Source/FindOpenCL.cmake     |   0
 .../Tensile/data}/Source/KernelHeader.h       |   0
 .../Tensile/data}/Source/TensileTypes.h       |   0
 .../data}/Source/client/CMakeLists.txt        |   0
 .../Source/client/include/BenchmarkTimer.hpp  |   0
 .../Source/client/include/CSVStackFile.hpp    |   0
 .../client/include/ClientProblemFactory.hpp   |   0
 .../client/include/ConvolutionProblem.hpp     |   0
 .../client/include/DataInitialization.hpp     |   0
 .../include/DataInitializationTyped.hpp       |   0
 .../Source/client/include/HardwareMonitor.hpp |   0
 .../include/HardwareMonitorListener.hpp       |   0
 .../client/include/HardwareMonitorType.hpp    |   0
 .../client/include/HardwareMonitorWindows.hpp |   0
 .../client/include/HardwareMonitor_fwd.hpp    |   0
 .../client/include/LibraryUpdateReporter.hpp  |   0
 .../Source/client/include/LogReporter.hpp     |   0
 .../client/include/MetaResultReporter.hpp     |   0
 .../Source/client/include/MetaRunListener.hpp |   0
 .../client/include/PerformanceReporter.hpp    |   0
 .../client/include/ProgressListener.hpp       |   0
 .../data}/Source/client/include/Reference.hpp |   0
 .../client/include/ReferenceValidator.hpp     |   0
 .../client/include/ResultComparison.hpp       |   0
 .../client/include/ResultFileReporter.hpp     |   0
 .../Source/client/include/ResultReporter.hpp  |   0
 .../client/include/ResultReporter_fwd.hpp     |   0
 .../Source/client/include/RunListener.hpp     |   0
 .../client/include/SolutionIterator.hpp       |   0
 .../Source/client/include/TimingEvents.hpp    |   0
 .../Tensile/data}/Source/client/main.cpp      |   0
 .../Source/client/source/BenchmarkTimer.cpp   |   0
 .../Source/client/source/CSVStackFile.cpp     |   0
 .../client/source/ClientProblemFactory.cpp    |   0
 .../client/source/ConvolutionProblem.cpp      |   0
 .../client/source/DataInitialization.cpp      |   0
 .../Source/client/source/HardwareMonitor.cpp  |   0
 .../client/source/HardwareMonitorListener.cpp |   0
 .../client/source/LibraryUpdateReporter.cpp   |   0
 .../Source/client/source/MetaRunListener.cpp  |   0
 .../client/source/PerformanceReporter.cpp     |   0
 .../Source/client/source/ProgressListener.cpp |   0
 .../data}/Source/client/source/Reference.cpp  |   0
 .../client/source/ReferenceValidator.cpp      |   0
 .../client/source/ResultFileReporter.cpp      |   0
 .../Source/client/source/ResultReporter.cpp   |   0
 .../Source/client/source/SolutionIterator.cpp |   0
 .../Source/client/source/TimingEvents.cpp     |   0
 .../data}/Source/cmake/FindROCmSMI.cmake      |   0
 .../Tensile/data}/Source/hip_f8_impl.h        |   0
 .../Tensile/data}/Source/lib/CMakeLists.txt   |   0
 .../SolutionLibraries/KernelsLiteNavi.yaml    |   0
 .../navi10_Cijk_Ailk_Bjlk_SB.yaml             |   0
 .../navi10_Cijk_Ailk_Bljk_SB.yaml             |   0
 .../navi10_Cijk_Alik_Bjlk_SB.yaml             |   0
 .../navi10_Cijk_Alik_Bljk_SB.yaml             |   0
 .../Source/lib/include/Tensile/AMDGPU.hpp     |   0
 .../lib/include/Tensile/AMDGPUPredicates.hpp  |   0
 .../lib/include/Tensile/AMDGPU_Detail.hpp     |   0
 .../include/Tensile/ArithmeticUnitTypes.hpp   |   0
 .../lib/include/Tensile/CachingLibrary.hpp    |   0
 .../Source/lib/include/Tensile/Comparison.hpp |   0
 .../include/Tensile/ContractionLibrary.hpp    |   0
 .../include/Tensile/ContractionProblem.hpp    |   0
 .../Tensile/ContractionProblemPredicates.hpp  |   0
 .../Tensile/ContractionProblemProperties.hpp  |   0
 .../Tensile/ContractionProblem_Detail.hpp     |   0
 .../Tensile/ContractionProblem_fwd.hpp        |   0
 .../include/Tensile/ContractionSolution.hpp   |   0
 .../Tensile/ContractionSolution_fwd.hpp       |   0
 .../lib/include/Tensile/Contractions.hpp      |   0
 .../Source/lib/include/Tensile/DataTypes.hpp  |   0
 .../include/Tensile/DataTypes_BFloat16.hpp    |   0
 .../Tensile/DataTypes_Float8_BFloat8.hpp      |   0
 .../lib/include/Tensile/DataTypes_Half.hpp    |   0
 .../lib/include/Tensile/DataTypes_Int8.hpp    |   0
 .../lib/include/Tensile/DataTypes_Int8x4.hpp  |   0
 .../include/Tensile/DataTypes_XFloat32.hpp    |   0
 .../Source/lib/include/Tensile/Debug.hpp      |   0
 .../lib/include/Tensile/DecisionTree.hpp      |   0
 .../include/Tensile/DecisionTreeLibrary.hpp   |   0
 .../Source/lib/include/Tensile/Distance.hpp   |   0
 .../lib/include/Tensile/DistinctType.hpp      |   0
 .../lib/include/Tensile/EmbeddedData.hpp      |   0
 .../lib/include/Tensile/EmbeddedLibrary.hpp   |   0
 .../lib/include/Tensile/ExactLogicLibrary.hpp |   0
 .../Tensile/GranularitySelectionLibrary.hpp   |   0
 .../lib/include/Tensile/KernelArguments.hpp   |   0
 .../include/Tensile/KernelLanguageTypes.hpp   |   0
 .../Source/lib/include/Tensile/MLFeatures.hpp |   0
 .../Source/lib/include/Tensile/Macros.hpp     |   0
 .../Source/lib/include/Tensile/MapLibrary.hpp |   0
 .../include/Tensile/MasterSolutionLibrary.hpp |   0
 .../lib/include/Tensile/MatchingLibrary.hpp   |   0
 .../Tensile/PerformanceMetricTypes.hpp        |   0
 .../include/Tensile/PlaceholderLibrary.hpp    |   0
 .../Source/lib/include/Tensile/Predicates.hpp |   0
 .../Source/lib/include/Tensile/ProblemKey.hpp |   0
 .../Source/lib/include/Tensile/Properties.hpp |   0
 .../lib/include/Tensile/PropertyMatching.hpp  |   0
 .../lib/include/Tensile/ScalarValueTypes.hpp  |   0
 .../lib/include/Tensile/Serialization.hpp     |   0
 .../include/Tensile/Serialization/Base.hpp    |   0
 .../Tensile/Serialization/Containers.hpp      |   0
 .../Serialization/ContractionPredicates.hpp   |   0
 .../Serialization/ContractionSolution.hpp     |   0
 .../Serialization/DecisionTreeLibrary.hpp     |   0
 .../Serialization/ExactLogicLibrary.hpp       |   0
 .../GranularitySelectionLibrary.hpp           |   0
 .../Tensile/Serialization/HasTraits.hpp       |   0
 .../Tensile/Serialization/MLFeatures.hpp      |   0
 .../Tensile/Serialization/MapLibrary.hpp      |   0
 .../Tensile/Serialization/MatchingLibrary.hpp |   0
 .../Serialization/PlaceholderLibrary.hpp      |   0
 .../Tensile/Serialization/Predicates.hpp      |   0
 .../Tensile/Serialization/Properties.hpp      |   0
 .../Tensile/Serialization/SolutionLibrary.hpp |   0
 .../include/Tensile/SingleSolutionLibrary.hpp |   0
 .../Source/lib/include/Tensile/Singleton.hpp  |   0
 .../lib/include/Tensile/SolutionLibrary.hpp   |   0
 .../include/Tensile/SolutionLibrary_fwd.hpp   |   0
 .../include/Tensile/SolutionMapLibrary.hpp    |   0
 .../Source/lib/include/Tensile/Tensile.hpp    |   0
 .../lib/include/Tensile/Tensile_fwd.hpp       |   0
 .../lib/include/Tensile/TensorDescriptor.hpp  |   0
 .../Tensile/TensorDescriptor_Detail.hpp       |   0
 .../include/Tensile/TensorDescriptor_fwd.hpp  |   0
 .../Source/lib/include/Tensile/TensorOps.hpp  |   0
 .../lib/include/Tensile/TensorOps_fwd.hpp     |   0
 .../Tensile/UserDrivenTuningParser.hpp        |   0
 .../Source/lib/include/Tensile/Utils.hpp      |   0
 .../data}/Source/lib/include/Tensile/geom.hpp |   0
 .../lib/include/Tensile/hip/HipHardware.hpp   |   0
 .../Tensile/hip/HipSolutionAdapter.hpp        |   0
 .../lib/include/Tensile/hip/HipUtils.hpp      |   0
 .../Source/lib/include/Tensile/hip_f8_impl.h  |   0
 .../lib/include/Tensile/llvm/Loading.hpp      |   0
 .../Source/lib/include/Tensile/llvm/YAML.hpp  |   0
 .../lib/include/Tensile/msgpack/Loading.hpp   |   0
 .../include/Tensile/msgpack/MessagePack.hpp   |   0
 .../Source/lib/include/Tensile/ocl/OclFwd.hpp |   0
 .../lib/include/Tensile/ocl/OclHardware.hpp   |   0
 .../Tensile/ocl/OclSolutionAdapter.hpp        |   0
 .../lib/include/Tensile/ocl/OclUtils.hpp      |   0
 .../data}/Source/lib/source/AMDGPU.cpp        |   0
 .../Source/lib/source/ArithmeticUnitTypes.cpp |   0
 .../Source/lib/source/ContractionProblem.cpp  |   0
 .../Source/lib/source/ContractionSolution.cpp |   0
 .../data}/Source/lib/source/DataTypes.cpp     |   0
 .../Tensile/data}/Source/lib/source/Debug.cpp |   0
 .../data}/Source/lib/source/EmbeddedData.cpp  |   0
 .../Source/lib/source/EmbeddedLibrary.cpp     |   0
 .../Source/lib/source/KernelArguments.cpp     |   0
 .../Source/lib/source/KernelLanguageTypes.cpp |   0
 .../data}/Source/lib/source/MLFeatures.cpp    |   0
 .../lib/source/PerformanceMetricTypes.cpp     |   0
 .../Source/lib/source/ScalarValueTypes.cpp    |   0
 .../data}/Source/lib/source/Tensile.cpp       |   0
 .../Source/lib/source/TensorDescriptor.cpp    |   0
 .../data}/Source/lib/source/TensorOps.cpp     |   0
 .../lib/source/UserDrivenTuningParser.cpp     |   0
 .../Tensile/data}/Source/lib/source/Utils.cpp |   0
 .../Source/lib/source/hip/CMakeLists.txt      |   0
 .../Source/lib/source/hip/HipHardware.cpp     |   0
 .../lib/source/hip/HipSolutionAdapter.cpp     |   0
 .../data}/Source/lib/source/llvm/Loading.cpp  |   0
 .../data}/Source/lib/source/llvm/YAML.cpp     |   0
 .../Source/lib/source/msgpack/MessagePack.cpp |   0
 .../Source/lib/source/ocl/CMakeLists.txt      |   0
 .../Source/lib/source/ocl/OclHardware.cpp     |   0
 .../lib/source/ocl/OclSolutionAdapter.cpp     |   0
 .../data}/Source/lib/source/ocl/OclUtils.cpp  |   0
 .../Tensile/data}/Source/multigpu.sh          |   0
 .../Tensile/data}/Source/tensile_bfloat16.h   |   0
 .../data}/Source/tensile_float8_bfloat8.h     |   0
 .../Tensile/data}/Source/winners.awk          |   0
 .../archive/merge_rocblas_yaml_files.py       |   0
 .../Tensile/data}/Utilities/merge.py          |   0
 .../Tensile/data}/cmake/TensileConfig.cmake   |   0
 .../data}/cmake/TensileConfigVersion.cmake    |   0
 1353 files changed, 28 deletions(-)
 delete mode 100644 Tensile/Configs/build_client.yaml
 rename {Tensile/Tests => Tests}/bugs/2sum_src_pgr1_smallsum.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/d2lds.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/fractional_plus_pbc.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/free10_swap.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/hpa_beta.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/nosourcetmp.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/simple_use_initial_strides_1.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/swizzlec1.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/test_glvw4_edge_no_asem.yaml (100%)
 rename {Tensile/Tests => Tests}/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml (100%)
 rename {Tensile/Tests => Tests}/conftest.py (100%)
 rename {Tensile/Tests => Tests}/create_tests.py (100%)
 rename {Tensile/Tests => Tests}/disabled/classic/test_convolution.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter1d.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter1d_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter2d_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter3d_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act1d_filter5d_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act2d_filter1d.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/convolution/test_conv_act2d_filter1d_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_dgemm_lite.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/hgemm_nn_source.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/multi_sum/test_.py (100%)
 rename {Tensile/Tests => Tests}/disabled/starter_packed_case.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/stridea0_pack_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/strideb0_pack_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/test_assertion_selection.yaml (100%)
 rename {Tensile/Tests => Tests}/disabled/test_create_library.yaml (100%)
 rename {Tensile/Tests => Tests}/dot/mixmad-nt.yaml (100%)
 rename {Tensile/Tests => Tests}/dot/mixmad.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/bfloat16/bfloat16_hpa_source_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/dgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/double_complex/double_complex_hip_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8b8s_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8hs_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/b8gemm_b8ss_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8f8s_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8hs_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float8/f8gemm_f8ss_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/float_complex/float_complex_hip_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/hgemm_hpa_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/igemm_hpa_hip_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/1LDSB.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/cgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/cgemm_asm_conjugate.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/hpa_bfloat16_gemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/hpa_hgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/emulation/mfma/sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/big_tensor/biga.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/big_tensor/bigskinny_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/big_tensor/largec.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml (100%)
 rename {Tensile/Configs => Tests/extended/bufferload_offset}/rocblas_sgemm_bufferload_limit.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic/test_persistent.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic/test_tensor_contraction.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic_source/test_dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic_source/test_hgemm_tn_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/classic_source/test_sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/YamlBuilder.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/header.yml (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/conftest.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_backwarddata_nchw.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_backwardweights_nchw.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_bad_input.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_conv_vs_contraction.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_cnhw.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nchw.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nchw_ckyx.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_nhwc.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_forward_pad.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/test_simple.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/unittests/test_problem_sizes.py (100%)
 rename {Tensile/Tests => Tests}/extended/convolution_config/unittests/test_string_swap.py (100%)
 rename {Tensile/Tests => Tests}/extended/custom_kernel/ck_dgemm_90a_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_hgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_f8.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_hgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_lds/dtl_tsgr_sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_cgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_dgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_f8gemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_hgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/direct_to_vgpr/dtv_igemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/dot2/hgemm_hpa_dot2_tn_2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/double_complex/zgemm_hip_source_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/flat/test_dgemm_asm_flat.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/flat/test_sgemm_asm_flat_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float8/f8gemm-hybrid-ss.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/float_complex/cgemm_hip_source_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/fractional/test_dgemm_fractional_tile_sweep.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/fractional/test_hgemm_fractional_tile_sweep.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/fractional/test_sgemm_fractional_edge.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/fractional/test_sgemm_fractional_tile_sweep.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/hgemm_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/hgemm_gsu_minkforgsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_batch.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_beta2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/global_split_u/sgemm_gsu_usebeta0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/hpa_source/test_hgemm_hpa_src_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/bfloat16_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/cgemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/dgemm_lsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/dgemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/f8gemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_grvw2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/igemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/sgemm_lsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/sgemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/local_split_u/zgemm_lsu_mfma.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_1sum_zp.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/2sum.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_gsu_src.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/2sum_src.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/3sum_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum/simple_sum2_scrambled.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/1sum_gsu_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/1sum_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsu_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsuremainder.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_pbd.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/2sum_scrambled_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum_gsu_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/3sum_simple.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/README (100%)
 rename {Tensile/Tests => Tests}/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/nonbatched/sgemm_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/multi_free2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/multi_free_batch.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/packed_perf_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/simple_stridea0_pack.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/simple_strideb0_pack.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/strideb0_pack_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/strideb0_pack_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/pack_tensor_dims/vectorstore0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_NN.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_NT.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_TN.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_A_TT.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_NN.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_NT.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_TN.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stagger_u/big_skinny_B_TT.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stream_k/sk_2tile_hgemm_hhs.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stream_k/sk_2tile_sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stream_k/sk_hgemm_hhs.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/stream_k/sk_sgemm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/README (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/allownofree.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/assert_size_equal.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/exact_conv.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/filter.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/ncdhw.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/sweep_packed_dims.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/swizzle3.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/test_nchw_filter_contraction.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/tensor_contraction/tlu0_non_unit_stride.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides/simple_use_initial_strides_1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_strides.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides/test_strides1.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/vector_width/hgemm_nn_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/vector_width/sgemm_nn_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/vector_width/sgemm_nn_source.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_2sum_zpother.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_1sum.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_both.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml (100%)
 rename {Tensile/Tests => Tests}/extended/zeropad/test_zp_simple_3sum_zp_other.yaml (100%)
 rename {Tensile/Tests => Tests}/hipModuleLoad_timing/Makefile (100%)
 rename {Tensile/Tests => Tests}/hipModuleLoad_timing/hipModuleLoadTiming.cpp (100%)
 rename {Tensile/Tests => Tests}/integration/test_integration.py (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/4xi8gemm_hpa_hip_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/cov/COV4.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/cov/COV5.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/cov/COVDefault.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/dgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/bfloat16_denorm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/dgemm_denorm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/mfma/sgemm_denorm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/denorm/sgemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/dgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/dgemm_general_batch_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/double_complex/double_complex_hip_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_cc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_cn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_ct.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tc.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/float_complex/float_complex_hip_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_general_batch_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/igemm_hpa_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/igemm_hpa_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/1LDSB.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/c-tile-reuse-no-nll.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/cgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/cgemm_asm_conjugate.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_gb_global_ldd.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/dgemm_large_offset.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hgemm_split_lds.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_hsgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_64bit_offset.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_64bit_offset_post.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_general_batch_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_split_lds.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/wider_local_read.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/zgemm_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/mfma/zgemm_asm_conjugate.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/nll_reproduce_bug.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/regression/persistent_kernel.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tn_bigk.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_exact_dict.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/sgemm_general_batch_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/source/test_dgemm_defaults.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/source/test_hgemm_defaults.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/source/test_hgemm_hpa.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/source/test_sgemm_defaults.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/wmma/hgemm_wmma.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_hgemm_wmma.yaml (100%)
 rename {Tensile/Tests => Tests}/pre_checkin/wmma/hpa_igemm_wmma.yaml (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/README (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/hgemm_gsu.yaml (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta0.yaml (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta1.yaml (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_beta2.yaml (100%)
 rename {Tensile/Tests => Tests}/special/global_split_u_src/sgemm_gsu_usebeta0.yaml (100%)
 rename {Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_lsu.yaml (100%)
 rename {Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/special/igemm/igemm_hpa_hip_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/hardcodedParameters.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/initialSolutionParameters.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx1010.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx1011.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx803.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx900.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx906.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/TensileLibrary_gfx908.co (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/library/metadata.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/library_data/problemType.yaml (100%)
 rename {Tensile/Tests => Tests}/test_data/unit/solutions/solutions_nn_3.yaml (100%)
 rename {Tensile/Tests => Tests}/unit/__init__.py (100%)
 rename {Tensile/Tests => Tests}/unit/customKernels/TestKernel.s (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/bad_file/bad.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/duplicate_kernel/a.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/duplicate_kernel/b.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/baz.s.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/kernel_named_bar.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v2/kernel_named_foo.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/baz.s.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/kernel_named_bar.txt (100%)
 rename {Tensile/Tests => Tests}/unit/replacement/known_kernels_v3/kernel_named_foo.txt (100%)
 rename {Tensile/Tests => Tests}/unit/test_Common.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_Component.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_Configuration.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_CustomKernels.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_DataType.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_HardwarePredicates.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_KernelWriterAssembly.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_LibraryIO.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_PerfMetricPredicates.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_Priority.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_ReplacementKernels.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_TensileCreateLibrary.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_conv_problem.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_exact_problem.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_makeProblem.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_mergeLogic.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_tryAssembler.py (100%)
 rename {Tensile/Tests => Tests}/unit/test_useGlobalParameters.py (100%)
 rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_nn.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_nt.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_tn.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/fast/igemm_asm_tt.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml (100%)
 rename {Tensile/Tests => Tests}/vega_20/nightly/local_split_u/igemm_lsu.yaml (100%)
 rename {Tensile/Tests => Tests}/weekly/assertions/README (100%)
 rename {Tensile/Tests => Tests}/weekly/assertions/test_hgemm_asem2_asm.yaml (100%)
 rename {Tensile/Tests => Tests}/weekly/classic_source/test_hgemm_vectors.yaml (100%)
 rename {Tensile/Tests => Tests}/weekly/classic_source/test_sgemm_vectors.yaml (100%)
 rename {Tensile/Tests => Tests}/yaml_only/test_config.py (100%)
 rename {Tensile/Tests => Tests}/yaml_only/test_ya (100%)
 rename {Tensile => src/Tensile}/AsmMemoryInstruction.py (100%)
 rename {Tensile => src/Tensile}/AsmRegisterPool.py (100%)
 rename {Tensile => src/Tensile}/AsmUtils.py (100%)
 rename {Tensile => src/Tensile}/BenchmarkProblems.py (100%)
 rename {Tensile => src/Tensile}/BenchmarkSplitter.py (100%)
 rename {Tensile => src/Tensile}/BenchmarkStructs.py (100%)
 rename {Tensile => src/Tensile}/ClientExecutable.py (100%)
 rename {Tensile => src/Tensile}/ClientWriter.py (100%)
 rename {Tensile => src/Tensile}/Code.py (100%)
 rename {Tensile => src/Tensile}/Common.py (100%)
 rename {Tensile => src/Tensile}/Component.py (100%)
 rename {Tensile => src/Tensile}/Components/ComputeStoreVgprs.py (100%)
 rename {Tensile => src/Tensile}/Components/LocalRead.py (100%)
 rename {Tensile => src/Tensile}/Components/LraTileAssignment.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_BF16_HPA.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F16.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F16_HPA.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F32.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F32C.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F64.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_F64C.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_I8X4.py (100%)
 rename {Tensile => src/Tensile}/Components/MAC_I8_HPA.py (100%)
 rename {Tensile => src/Tensile}/Components/MFMA.py (100%)
 rename {Tensile => src/Tensile}/Components/NotLocalFullTileElements.py (100%)
 rename {Tensile => src/Tensile}/Components/Priority.py (100%)
 rename {Tensile => src/Tensile}/Components/PseudoRandomGenerator.py (100%)
 rename {Tensile => src/Tensile}/Components/ShiftVectorComponents.py (100%)
 rename {Tensile => src/Tensile}/Components/Signature.py (100%)
 rename {Tensile => src/Tensile}/Components/__init__.py (100%)
 rename {Tensile => src/Tensile}/Configuration.py (100%)
 rename {Tensile => src/Tensile}/Contractions.py (100%)
 rename {Tensile => src/Tensile}/CustomKernels.py (100%)
 rename {Tensile => src/Tensile}/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s (100%)
 rename {Tensile => src/Tensile}/DataType.py (100%)
 rename {Tensile => src/Tensile}/EmbeddedData.py (100%)
 rename {Tensile => src/Tensile}/GenerateSummations.py (100%)
 rename {Tensile => src/Tensile}/Hardware.py (100%)
 rename {Tensile => src/Tensile}/KernelWriter.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterAssembly.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterBase.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterBetaOnly.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterConversion.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterSource.py (100%)
 rename {Tensile => src/Tensile}/KernelWriterStreamKInit.py (100%)
 rename {Tensile => src/Tensile}/LibraryIO.py (100%)
 rename {Tensile => src/Tensile}/LibraryLogic.py (100%)
 rename {Tensile => src/Tensile}/Parallel.py (100%)
 rename {Tensile => src/Tensile}/Properties.py (100%)
 rename {Tensile => src/Tensile}/ReplacementKernels.py (100%)
 rename {Tensile => src/Tensile}/SolutionLibrary.py (100%)
 rename {Tensile => src/Tensile}/SolutionSelectionLibrary.py (100%)
 rename {Tensile => src/Tensile}/SolutionStructs.py (100%)
 rename {Tensile => src/Tensile}/SolutionWriter.py (100%)
 rename {Tensile => src/Tensile}/Tensile.py (100%)
 rename {Tensile => src/Tensile}/TensileBenchmarkCluster.py (100%)
 rename {Tensile => src/Tensile}/TensileBenchmarkClusterScripts.py (100%)
 rename {Tensile => src/Tensile}/TensileBenchmarkLibraryClient.py (100%)
 rename {Tensile => src/Tensile}/TensileClientConfig.py (100%)
 rename {Tensile => src/Tensile}/TensileCreateLibrary.py (100%)
 rename {Tensile => src/Tensile}/TensileLibLogicToYaml.py (100%)
 rename {Tensile => src/Tensile}/TensileMergeLibrary.py (100%)
 rename {Tensile => src/Tensile}/TensileRetuneLibrary.py (100%)
 rename {Tensile => src/Tensile}/TensileUpdateLibrary.py (100%)
 rename {Tensile => src/Tensile}/Utils.py (100%)
 rename {Tensile => src/Tensile}/__init__.py (100%)
 rename {Tensile => src/Tensile}/bin/Tensile (100%)
 rename {Tensile => src/Tensile}/bin/TensileBenchmarkCluster (100%)
 rename {Tensile => src/Tensile}/bin/TensileClientConfig (100%)
 rename {Tensile => src/Tensile}/bin/TensileCreateLibrary (100%)
 rename {Tensile => src/Tensile}/bin/TensileGenerateSummations (100%)
 rename {Tensile => src/Tensile}/bin/TensileLibLogicToYaml (100%)
 rename {Tensile => src/Tensile}/bin/TensileMergeLibrary (100%)
 rename {Tensile => src/Tensile}/bin/TensileRetuneLibrary (100%)
 rename {Tensile => src/Tensile}/bin/TensileUpdateLibrary (100%)
 rename {Tensile => src/Tensile/data}/Configs/alternate-format/sizeList-example.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/alternate-format/vega20-example.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_nn.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_nn_batched.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_nt.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_nt_batched.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_tn.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/deep_bench_tn_batched.csv (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_hpa_bf16_nt_test.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_lite_test.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_igemm_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/mfma_test.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/sgemm_tlunn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/mfma/sgemm_transposeLDS.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/Makefile (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/README.md (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/README (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/README.md (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/README.md (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/logs/convert.log (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rk/2020-08-12/logs/merge.log (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/header.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_hip_only.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_vega10_only.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/boiler/library_logic_vega20_only.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/convert_cfg.py (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/make_all.sh (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_gemm_large.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet50_all.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet50_batch64.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nn/resnet_batch64_B.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/deepbench_gemm_large.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/nt/resnet50_all.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/deepbench_gemm_large.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/problems/tn/resnet50_all.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_large_explore_3.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_large_explore_5.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_quick.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_skinny_explore_3.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/hgemm_skinny_explore_5.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_3.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_5.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_large_explore_7.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_quick.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_3.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_4.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_5.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/solutions/sgemm_skinny_explore_7.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_nn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_nt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_tn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/hgemm_tt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_nn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_nt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_tn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/igemm_tt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_nn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_nt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_tn.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/miopen/types/sgemm_tt.yml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_cgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_asm_square.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_bufferload_limit.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_nt_resume_train_exp.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_dgemm_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_asm_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_bufferload_limit.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_inc1_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_nn_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_nt_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_tn_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hpa_igemm_tt_hip.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_hsgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_asm_full_tt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_igemm_hip_single_kernel.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_only.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_asm_single_kernel.yaml (100%)
 rename {Tensile/Tests/extended/bufferload_offset => src/Tensile/data/Configs}/rocblas_sgemm_bufferload_limit.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_example.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_hip_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_zgemm.yaml (100%)
 rename {Tensile => src/Tensile/data}/Configs/rocblas_zgemm_asm_lite.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/BDAS/dgemm_kmeans.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/BDAS/dgemm_pca.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/BERT/sgemm_xdlops.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/README (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1_af0em.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1_oddpbd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1u2_bdww.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x1u2_fwd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_1x7_fwd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_fwd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_fwd2.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x1_roundup.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_7x7u2_fwd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_bwdd_pbd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv/conv_fwd.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv_bwdd_ex0.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv_bwdd_ex1.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv_bwdw_big_gsu.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv_bwdw_small_gsu.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/conv_fwd_ex0.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/dgemm_large_square.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/hpl.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/hpl_one.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/hpl_quick.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/hpl_quick44k.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/inception/conv_1x1u1.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/inception/conv_1x1u1_starter.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/inception/conv_NxN.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_nn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_nt.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/sgemm_large_square_tn.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/README (100%)
 rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_baseline0.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_uis_cd0.yaml (100%)
 rename {Tensile => src/Tensile/data}/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/CMakeLists.txt (100%)
 rename {Tensile => src/Tensile/data}/Source/EnableWarnings.cmake (100%)
 rename {Tensile => src/Tensile/data}/Source/FindHIP.cmake (100%)
 rename {Tensile => src/Tensile/data}/Source/FindOpenCL.cmake (100%)
 rename {Tensile => src/Tensile/data}/Source/KernelHeader.h (100%)
 rename {Tensile => src/Tensile/data}/Source/TensileTypes.h (100%)
 rename {Tensile => src/Tensile/data}/Source/client/CMakeLists.txt (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/BenchmarkTimer.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/CSVStackFile.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ClientProblemFactory.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ConvolutionProblem.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/DataInitialization.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/DataInitializationTyped.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitor.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorListener.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorType.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitorWindows.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/HardwareMonitor_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/LibraryUpdateReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/LogReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/MetaResultReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/MetaRunListener.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/PerformanceReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ProgressListener.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/Reference.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ReferenceValidator.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ResultComparison.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ResultFileReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ResultReporter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/ResultReporter_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/RunListener.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/SolutionIterator.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/include/TimingEvents.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/main.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/BenchmarkTimer.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/CSVStackFile.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ClientProblemFactory.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ConvolutionProblem.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/DataInitialization.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/HardwareMonitor.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/HardwareMonitorListener.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/LibraryUpdateReporter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/MetaRunListener.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/PerformanceReporter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ProgressListener.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/Reference.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ReferenceValidator.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ResultFileReporter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/ResultReporter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/SolutionIterator.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/client/source/TimingEvents.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/cmake/FindROCmSMI.cmake (100%)
 rename {Tensile => src/Tensile/data}/Source/hip_f8_impl.h (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/CMakeLists.txt (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPU.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPUPredicates.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/AMDGPU_Detail.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/CachingLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Comparison.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblemPredicates.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblemProperties.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem_Detail.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionProblem_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionSolution.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ContractionSolution_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Contractions.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_BFloat16.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Half.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Int8.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_Int8x4.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DataTypes_XFloat32.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Debug.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DecisionTree.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DecisionTreeLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Distance.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/DistinctType.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/EmbeddedData.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/EmbeddedLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ExactLogicLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/KernelArguments.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/KernelLanguageTypes.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MLFeatures.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Macros.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MapLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MasterSolutionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/MatchingLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PerformanceMetricTypes.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PlaceholderLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Predicates.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ProblemKey.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Properties.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/PropertyMatching.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ScalarValueTypes.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Base.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Containers.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/HasTraits.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MLFeatures.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MapLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Predicates.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/Properties.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SingleSolutionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Singleton.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/SolutionMapLibrary.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Tensile.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Tensile_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorOps.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/TensorOps_fwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/UserDrivenTuningParser.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/Utils.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/geom.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipHardware.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip/HipUtils.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/hip_f8_impl.h (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/llvm/Loading.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/llvm/YAML.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/msgpack/Loading.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/msgpack/MessagePack.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclFwd.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclHardware.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/include/Tensile/ocl/OclUtils.hpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/AMDGPU.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ArithmeticUnitTypes.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ContractionProblem.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ContractionSolution.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/DataTypes.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/Debug.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/EmbeddedData.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/EmbeddedLibrary.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/KernelArguments.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/KernelLanguageTypes.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/MLFeatures.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/PerformanceMetricTypes.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ScalarValueTypes.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/Tensile.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/TensorDescriptor.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/TensorOps.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/UserDrivenTuningParser.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/Utils.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/hip/CMakeLists.txt (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/hip/HipHardware.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/hip/HipSolutionAdapter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/llvm/Loading.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/llvm/YAML.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/msgpack/MessagePack.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/CMakeLists.txt (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclHardware.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclSolutionAdapter.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/lib/source/ocl/OclUtils.cpp (100%)
 rename {Tensile => src/Tensile/data}/Source/multigpu.sh (100%)
 rename {Tensile => src/Tensile/data}/Source/tensile_bfloat16.h (100%)
 rename {Tensile => src/Tensile/data}/Source/tensile_float8_bfloat8.h (100%)
 rename {Tensile => src/Tensile/data}/Source/winners.awk (100%)
 rename {Tensile => src/Tensile/data}/Utilities/archive/merge_rocblas_yaml_files.py (100%)
 rename {Tensile => src/Tensile/data}/Utilities/merge.py (100%)
 rename {Tensile => src/Tensile/data}/cmake/TensileConfig.cmake (100%)
 rename {Tensile => src/Tensile/data}/cmake/TensileConfigVersion.cmake (100%)

diff --git a/Tensile/Configs/build_client.yaml b/Tensile/Configs/build_client.yaml
deleted file mode 100644
index 70cc3f62f1..0000000000
--- a/Tensile/Configs/build_client.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-GlobalParameters:
-  MinimumRequiredVersion: 4.4.0
-  PrintLevel: 1
-  ForceRedoBenchmarkProblems: True
-  ForceRedoLibraryLogic: True
-  ForceRedoLibraryClient: True
-  CMakeBuildType: Release
-  EnqueuesPerSync: 1
-  SyncsPerBenchmark: 1
-  LibraryPrintDebug: False
-  NumElementsToValidate: 0
-  ValidationMaxToPrint: 4
-  ValidationPrintValids: False
-  ShortNames: False
-  MergeFiles: True
-  Platform: 0
-  Device: 0
-  KernelTime: True
-  DataInitTypeBeta : 0
-
-BenchmarkProblems:
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: s
-
-    - # BenchmarkProblemSizeGroup - Standard
-      InitialSolutionParameters:
diff --git a/Tensile/Tests/bugs/2sum_src_pgr1_smallsum.yaml b/Tests/bugs/2sum_src_pgr1_smallsum.yaml
similarity index 100%
rename from Tensile/Tests/bugs/2sum_src_pgr1_smallsum.yaml
rename to Tests/bugs/2sum_src_pgr1_smallsum.yaml
diff --git a/Tensile/Tests/bugs/d2lds.yaml b/Tests/bugs/d2lds.yaml
similarity index 100%
rename from Tensile/Tests/bugs/d2lds.yaml
rename to Tests/bugs/d2lds.yaml
diff --git a/Tensile/Tests/bugs/fractional_plus_pbc.yaml b/Tests/bugs/fractional_plus_pbc.yaml
similarity index 100%
rename from Tensile/Tests/bugs/fractional_plus_pbc.yaml
rename to Tests/bugs/fractional_plus_pbc.yaml
diff --git a/Tensile/Tests/bugs/free10_swap.yaml b/Tests/bugs/free10_swap.yaml
similarity index 100%
rename from Tensile/Tests/bugs/free10_swap.yaml
rename to Tests/bugs/free10_swap.yaml
diff --git a/Tensile/Tests/bugs/hpa_beta.yaml b/Tests/bugs/hpa_beta.yaml
similarity index 100%
rename from Tensile/Tests/bugs/hpa_beta.yaml
rename to Tests/bugs/hpa_beta.yaml
diff --git a/Tensile/Tests/bugs/nosourcetmp.yaml b/Tests/bugs/nosourcetmp.yaml
similarity index 100%
rename from Tensile/Tests/bugs/nosourcetmp.yaml
rename to Tests/bugs/nosourcetmp.yaml
diff --git a/Tensile/Tests/bugs/simple_use_initial_strides_1.yaml b/Tests/bugs/simple_use_initial_strides_1.yaml
similarity index 100%
rename from Tensile/Tests/bugs/simple_use_initial_strides_1.yaml
rename to Tests/bugs/simple_use_initial_strides_1.yaml
diff --git a/Tensile/Tests/bugs/swizzlec1.yaml b/Tests/bugs/swizzlec1.yaml
similarity index 100%
rename from Tensile/Tests/bugs/swizzlec1.yaml
rename to Tests/bugs/swizzlec1.yaml
diff --git a/Tensile/Tests/bugs/test_glvw4_edge_no_asem.yaml b/Tests/bugs/test_glvw4_edge_no_asem.yaml
similarity index 100%
rename from Tensile/Tests/bugs/test_glvw4_edge_no_asem.yaml
rename to Tests/bugs/test_glvw4_edge_no_asem.yaml
diff --git a/Tensile/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml b/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml
similarity index 100%
rename from Tensile/Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml
rename to Tests/bugs/test_nhwc_defaults[Run_Contraction-src1].contraction.yaml
diff --git a/Tensile/Tests/conftest.py b/Tests/conftest.py
similarity index 100%
rename from Tensile/Tests/conftest.py
rename to Tests/conftest.py
diff --git a/Tensile/Tests/create_tests.py b/Tests/create_tests.py
similarity index 100%
rename from Tensile/Tests/create_tests.py
rename to Tests/create_tests.py
diff --git a/Tensile/Tests/disabled/classic/test_convolution.yaml b/Tests/disabled/classic/test_convolution.yaml
similarity index 100%
rename from Tensile/Tests/disabled/classic/test_convolution.yaml
rename to Tests/disabled/classic/test_convolution.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml b/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d.yaml
rename to Tests/disabled/convolution/test_conv_act1d_filter1d.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml
rename to Tests/disabled/convolution/test_conv_act1d_filter1d_simple.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml
rename to Tests/disabled/convolution/test_conv_act1d_filter2d_simple.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml
rename to Tests/disabled/convolution/test_conv_act1d_filter3d_simple.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml b/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml
rename to Tests/disabled/convolution/test_conv_act1d_filter5d_simple.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml b/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d.yaml
rename to Tests/disabled/convolution/test_conv_act2d_filter1d.yaml
diff --git a/Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml b/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml
similarity index 100%
rename from Tensile/Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml
rename to Tests/disabled/convolution/test_conv_act2d_filter1d_simple.yaml
diff --git a/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml b/Tests/disabled/direct_to_lds/dtl_dgemm.yaml
similarity index 100%
rename from Tensile/Tests/disabled/direct_to_lds/dtl_dgemm.yaml
rename to Tests/disabled/direct_to_lds/dtl_dgemm.yaml
diff --git a/Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml b/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml
similarity index 100%
rename from Tensile/Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml
rename to Tests/disabled/direct_to_lds/dtl_dgemm_lite.yaml
diff --git a/Tensile/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml b/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml
similarity index 100%
rename from Tensile/Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml
rename to Tests/disabled/direct_to_lds/dtl_tsgr_dgemm.yaml
diff --git a/Tensile/Tests/disabled/hgemm_nn_source.yaml b/Tests/disabled/hgemm_nn_source.yaml
similarity index 100%
rename from Tensile/Tests/disabled/hgemm_nn_source.yaml
rename to Tests/disabled/hgemm_nn_source.yaml
diff --git a/Tensile/Tests/disabled/multi_sum/test_.py b/Tests/disabled/multi_sum/test_.py
similarity index 100%
rename from Tensile/Tests/disabled/multi_sum/test_.py
rename to Tests/disabled/multi_sum/test_.py
diff --git a/Tensile/Tests/disabled/starter_packed_case.yaml b/Tests/disabled/starter_packed_case.yaml
similarity index 100%
rename from Tensile/Tests/disabled/starter_packed_case.yaml
rename to Tests/disabled/starter_packed_case.yaml
diff --git a/Tensile/Tests/disabled/stridea0_pack_nt.yaml b/Tests/disabled/stridea0_pack_nt.yaml
similarity index 100%
rename from Tensile/Tests/disabled/stridea0_pack_nt.yaml
rename to Tests/disabled/stridea0_pack_nt.yaml
diff --git a/Tensile/Tests/disabled/strideb0_pack_nn.yaml b/Tests/disabled/strideb0_pack_nn.yaml
similarity index 100%
rename from Tensile/Tests/disabled/strideb0_pack_nn.yaml
rename to Tests/disabled/strideb0_pack_nn.yaml
diff --git a/Tensile/Tests/disabled/test_assertion_selection.yaml b/Tests/disabled/test_assertion_selection.yaml
similarity index 100%
rename from Tensile/Tests/disabled/test_assertion_selection.yaml
rename to Tests/disabled/test_assertion_selection.yaml
diff --git a/Tensile/Tests/disabled/test_create_library.yaml b/Tests/disabled/test_create_library.yaml
similarity index 100%
rename from Tensile/Tests/disabled/test_create_library.yaml
rename to Tests/disabled/test_create_library.yaml
diff --git a/Tensile/Tests/dot/mixmad-nt.yaml b/Tests/dot/mixmad-nt.yaml
similarity index 100%
rename from Tensile/Tests/dot/mixmad-nt.yaml
rename to Tests/dot/mixmad-nt.yaml
diff --git a/Tensile/Tests/dot/mixmad.yaml b/Tests/dot/mixmad.yaml
similarity index 100%
rename from Tensile/Tests/dot/mixmad.yaml
rename to Tests/dot/mixmad.yaml
diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml
rename to Tests/emulation/bfloat16/bfloat16_hpa_source_nn.yaml
diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml
rename to Tests/emulation/bfloat16/bfloat16_hpa_source_nt.yaml
diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml
rename to Tests/emulation/bfloat16/bfloat16_hpa_source_tn.yaml
diff --git a/Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml b/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml
rename to Tests/emulation/bfloat16/bfloat16_hpa_source_tt.yaml
diff --git a/Tensile/Tests/emulation/dgemm_asm.yaml b/Tests/emulation/dgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/dgemm_asm.yaml
rename to Tests/emulation/dgemm_asm.yaml
diff --git a/Tensile/Tests/emulation/double_complex/double_complex_hip_cn.yaml b/Tests/emulation/double_complex/double_complex_hip_cn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/double_complex/double_complex_hip_cn.yaml
rename to Tests/emulation/double_complex/double_complex_hip_cn.yaml
diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml
rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_SR_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml
rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8b8s_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml
rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8hs_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml b/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml
rename to Tests/emulation/float8/b8f8gemm_hybrid_b8f8ss_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml
rename to Tests/emulation/float8/b8gemm_b8b8s_SR_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml
rename to Tests/emulation/float8/b8gemm_b8b8s_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml
rename to Tests/emulation/float8/b8gemm_b8hs_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml b/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml
rename to Tests/emulation/float8/b8gemm_b8ss_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml
rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_SR_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml
rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8b8s_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml
rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8hs_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml b/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml
rename to Tests/emulation/float8/f8b8gemm_hybrid_f8b8ss_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml b/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml
rename to Tests/emulation/float8/f8f8s-NT-edge-range-A3B3C3-alpha2-beta1.yaml
diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml
rename to Tests/emulation/float8/f8gemm_f8f8s_SR_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml
rename to Tests/emulation/float8/f8gemm_f8f8s_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml
rename to Tests/emulation/float8/f8gemm_f8hs_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml b/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml
rename to Tests/emulation/float8/f8gemm_f8ss_gfx940.yaml
diff --git a/Tensile/Tests/emulation/float_complex/float_complex_hip_cc.yaml b/Tests/emulation/float_complex/float_complex_hip_cc.yaml
similarity index 100%
rename from Tensile/Tests/emulation/float_complex/float_complex_hip_cc.yaml
rename to Tests/emulation/float_complex/float_complex_hip_cc.yaml
diff --git a/Tensile/Tests/emulation/hgemm_asm_nn.yaml b/Tests/emulation/hgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_asm_nn.yaml
rename to Tests/emulation/hgemm_asm_nn.yaml
diff --git a/Tensile/Tests/emulation/hgemm_asm_nt.yaml b/Tests/emulation/hgemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_asm_nt.yaml
rename to Tests/emulation/hgemm_asm_nt.yaml
diff --git a/Tensile/Tests/emulation/hgemm_asm_tn.yaml b/Tests/emulation/hgemm_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_asm_tn.yaml
rename to Tests/emulation/hgemm_asm_tn.yaml
diff --git a/Tensile/Tests/emulation/hgemm_asm_tt.yaml b/Tests/emulation/hgemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_asm_tt.yaml
rename to Tests/emulation/hgemm_asm_tt.yaml
diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_nn.yaml b/Tests/emulation/hgemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_hpa_asm_nn.yaml
rename to Tests/emulation/hgemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_nt.yaml b/Tests/emulation/hgemm_hpa_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_hpa_asm_nt.yaml
rename to Tests/emulation/hgemm_hpa_asm_nt.yaml
diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_tn.yaml b/Tests/emulation/hgemm_hpa_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_hpa_asm_tn.yaml
rename to Tests/emulation/hgemm_hpa_asm_tn.yaml
diff --git a/Tensile/Tests/emulation/hgemm_hpa_asm_tt.yaml b/Tests/emulation/hgemm_hpa_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/hgemm_hpa_asm_tt.yaml
rename to Tests/emulation/hgemm_hpa_asm_tt.yaml
diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_nn.yaml b/Tests/emulation/igemm_hpa_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/igemm_hpa_hip_nn.yaml
rename to Tests/emulation/igemm_hpa_hip_nn.yaml
diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_nt.yaml b/Tests/emulation/igemm_hpa_hip_nt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/igemm_hpa_hip_nt.yaml
rename to Tests/emulation/igemm_hpa_hip_nt.yaml
diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_tn.yaml b/Tests/emulation/igemm_hpa_hip_tn.yaml
similarity index 100%
rename from Tensile/Tests/emulation/igemm_hpa_hip_tn.yaml
rename to Tests/emulation/igemm_hpa_hip_tn.yaml
diff --git a/Tensile/Tests/emulation/igemm_hpa_hip_tt.yaml b/Tests/emulation/igemm_hpa_hip_tt.yaml
similarity index 100%
rename from Tensile/Tests/emulation/igemm_hpa_hip_tt.yaml
rename to Tests/emulation/igemm_hpa_hip_tt.yaml
diff --git a/Tensile/Tests/emulation/mfma/1LDSB.yaml b/Tests/emulation/mfma/1LDSB.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/1LDSB.yaml
rename to Tests/emulation/mfma/1LDSB.yaml
diff --git a/Tensile/Tests/emulation/mfma/cgemm_asm.yaml b/Tests/emulation/mfma/cgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/cgemm_asm.yaml
rename to Tests/emulation/mfma/cgemm_asm.yaml
diff --git a/Tensile/Tests/emulation/mfma/cgemm_asm_conjugate.yaml b/Tests/emulation/mfma/cgemm_asm_conjugate.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/cgemm_asm_conjugate.yaml
rename to Tests/emulation/mfma/cgemm_asm_conjugate.yaml
diff --git a/Tensile/Tests/emulation/mfma/dgemm.yaml b/Tests/emulation/mfma/dgemm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/dgemm.yaml
rename to Tests/emulation/mfma/dgemm.yaml
diff --git a/Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml b/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml
rename to Tests/emulation/mfma/hpa_bfloat16_gemm_asm.yaml
diff --git a/Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml b/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
rename to Tests/emulation/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
diff --git a/Tensile/Tests/emulation/mfma/hpa_hgemm_asm.yaml b/Tests/emulation/mfma/hpa_hgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/hpa_hgemm_asm.yaml
rename to Tests/emulation/mfma/hpa_hgemm_asm.yaml
diff --git a/Tensile/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml b/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml
rename to Tests/emulation/mfma/hpa_igemm_i8_asm_gfx940.yaml
diff --git a/Tensile/Tests/emulation/mfma/sgemm.yaml b/Tests/emulation/mfma/sgemm.yaml
similarity index 100%
rename from Tensile/Tests/emulation/mfma/sgemm.yaml
rename to Tests/emulation/mfma/sgemm.yaml
diff --git a/Tensile/Tests/extended/big_tensor/biga.yaml b/Tests/extended/big_tensor/biga.yaml
similarity index 100%
rename from Tensile/Tests/extended/big_tensor/biga.yaml
rename to Tests/extended/big_tensor/biga.yaml
diff --git a/Tensile/Tests/extended/big_tensor/bigskinny_nt.yaml b/Tests/extended/big_tensor/bigskinny_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/big_tensor/bigskinny_nt.yaml
rename to Tests/extended/big_tensor/bigskinny_nt.yaml
diff --git a/Tensile/Tests/extended/big_tensor/largec.yaml b/Tests/extended/big_tensor/largec.yaml
similarity index 100%
rename from Tensile/Tests/extended/big_tensor/largec.yaml
rename to Tests/extended/big_tensor/largec.yaml
diff --git a/Tensile/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml b/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml
similarity index 100%
rename from Tensile/Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml
rename to Tests/extended/bufferload_offset/rocblas_dgemm_bufferload_limit.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_bufferload_limit.yaml b/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_bufferload_limit.yaml
rename to Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml
diff --git a/Tensile/Tests/extended/classic/test_persistent.yaml b/Tests/extended/classic/test_persistent.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic/test_persistent.yaml
rename to Tests/extended/classic/test_persistent.yaml
diff --git a/Tensile/Tests/extended/classic/test_tensor_contraction.yaml b/Tests/extended/classic/test_tensor_contraction.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic/test_tensor_contraction.yaml
rename to Tests/extended/classic/test_tensor_contraction.yaml
diff --git a/Tensile/Tests/extended/classic_source/test_dgemm.yaml b/Tests/extended/classic_source/test_dgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic_source/test_dgemm.yaml
rename to Tests/extended/classic_source/test_dgemm.yaml
diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_nn.yaml b/Tests/extended/classic_source/test_hgemm_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic_source/test_hgemm_nn.yaml
rename to Tests/extended/classic_source/test_hgemm_nn.yaml
diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_nt.yaml b/Tests/extended/classic_source/test_hgemm_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic_source/test_hgemm_nt.yaml
rename to Tests/extended/classic_source/test_hgemm_nt.yaml
diff --git a/Tensile/Tests/extended/classic_source/test_hgemm_tn_tt.yaml b/Tests/extended/classic_source/test_hgemm_tn_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic_source/test_hgemm_tn_tt.yaml
rename to Tests/extended/classic_source/test_hgemm_tn_tt.yaml
diff --git a/Tensile/Tests/extended/classic_source/test_sgemm.yaml b/Tests/extended/classic_source/test_sgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/classic_source/test_sgemm.yaml
rename to Tests/extended/classic_source/test_sgemm.yaml
diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py b/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py
rename to Tests/extended/convolution_config/YamlBuilder/YamlBuilder.py
diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/header.yml b/Tests/extended/convolution_config/YamlBuilder/header.yml
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/YamlBuilder/header.yml
rename to Tests/extended/convolution_config/YamlBuilder/header.yml
diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml b/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml
rename to Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_1.yml
diff --git a/Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml b/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml
rename to Tests/extended/convolution_config/YamlBuilder/solutions/sgemm_src.yml
diff --git a/Tensile/Tests/extended/convolution_config/conftest.py b/Tests/extended/convolution_config/conftest.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/conftest.py
rename to Tests/extended/convolution_config/conftest.py
diff --git a/Tensile/Tests/extended/convolution_config/test_backwarddata_nchw.py b/Tests/extended/convolution_config/test_backwarddata_nchw.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_backwarddata_nchw.py
rename to Tests/extended/convolution_config/test_backwarddata_nchw.py
diff --git a/Tensile/Tests/extended/convolution_config/test_backwardweights_nchw.py b/Tests/extended/convolution_config/test_backwardweights_nchw.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_backwardweights_nchw.py
rename to Tests/extended/convolution_config/test_backwardweights_nchw.py
diff --git a/Tensile/Tests/extended/convolution_config/test_bad_input.py b/Tests/extended/convolution_config/test_bad_input.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_bad_input.py
rename to Tests/extended/convolution_config/test_bad_input.py
diff --git a/Tensile/Tests/extended/convolution_config/test_conv_vs_contraction.py b/Tests/extended/convolution_config/test_conv_vs_contraction.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_conv_vs_contraction.py
rename to Tests/extended/convolution_config/test_conv_vs_contraction.py
diff --git a/Tensile/Tests/extended/convolution_config/test_forward_cnhw.py b/Tests/extended/convolution_config/test_forward_cnhw.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_forward_cnhw.py
rename to Tests/extended/convolution_config/test_forward_cnhw.py
diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nchw.py b/Tests/extended/convolution_config/test_forward_nchw.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_forward_nchw.py
rename to Tests/extended/convolution_config/test_forward_nchw.py
diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nchw_ckyx.py b/Tests/extended/convolution_config/test_forward_nchw_ckyx.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_forward_nchw_ckyx.py
rename to Tests/extended/convolution_config/test_forward_nchw_ckyx.py
diff --git a/Tensile/Tests/extended/convolution_config/test_forward_nhwc.py b/Tests/extended/convolution_config/test_forward_nhwc.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_forward_nhwc.py
rename to Tests/extended/convolution_config/test_forward_nhwc.py
diff --git a/Tensile/Tests/extended/convolution_config/test_forward_pad.py b/Tests/extended/convolution_config/test_forward_pad.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_forward_pad.py
rename to Tests/extended/convolution_config/test_forward_pad.py
diff --git a/Tensile/Tests/extended/convolution_config/test_simple.py b/Tests/extended/convolution_config/test_simple.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/test_simple.py
rename to Tests/extended/convolution_config/test_simple.py
diff --git a/Tensile/Tests/extended/convolution_config/unittests/test_problem_sizes.py b/Tests/extended/convolution_config/unittests/test_problem_sizes.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/unittests/test_problem_sizes.py
rename to Tests/extended/convolution_config/unittests/test_problem_sizes.py
diff --git a/Tensile/Tests/extended/convolution_config/unittests/test_string_swap.py b/Tests/extended/convolution_config/unittests/test_string_swap.py
similarity index 100%
rename from Tensile/Tests/extended/convolution_config/unittests/test_string_swap.py
rename to Tests/extended/convolution_config/unittests/test_string_swap.py
diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml b/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml
rename to Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml
diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml b/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml
similarity index 100%
rename from Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml
rename to Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml b/Tests/extended/direct_to_lds/dtl_dgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_dgemm.yaml
rename to Tests/extended/direct_to_lds/dtl_dgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml b/Tests/extended/direct_to_lds/dtl_hgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_hgemm.yaml
rename to Tests/extended/direct_to_lds/dtl_hgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_sgemm.yaml b/Tests/extended/direct_to_lds/dtl_sgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_sgemm.yaml
rename to Tests/extended/direct_to_lds/dtl_sgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml
rename to Tests/extended/direct_to_lds/dtl_tsgr_f8.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml
rename to Tests/extended/direct_to_lds/dtl_tsgr_hgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml b/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml
rename to Tests/extended/direct_to_lds/dtl_tsgr_sgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_cgemm.yaml
rename to Tests/extended/direct_to_vgpr/dtv_cgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm.yaml
rename to Tests/extended/direct_to_vgpr/dtv_dgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml b/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml
rename to Tests/extended/direct_to_vgpr/dtv_dgemm_a1b0.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml b/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml
rename to Tests/extended/direct_to_vgpr/dtv_f8gemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml b/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_hgemm.yaml
rename to Tests/extended/direct_to_vgpr/dtv_hgemm.yaml
diff --git a/Tensile/Tests/extended/direct_to_vgpr/dtv_igemm.yaml b/Tests/extended/direct_to_vgpr/dtv_igemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/direct_to_vgpr/dtv_igemm.yaml
rename to Tests/extended/direct_to_vgpr/dtv_igemm.yaml
diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml
rename to Tests/extended/dot2/hgemm_hpa_dot2_nn.yaml
diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml
rename to Tests/extended/dot2/hgemm_hpa_dot2_tn.yaml
diff --git a/Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml b/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml
similarity index 100%
rename from Tensile/Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml
rename to Tests/extended/dot2/hgemm_hpa_dot2_tn_2.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_asm.yaml b/Tests/extended/double_complex/zgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_asm.yaml
rename to Tests/extended/double_complex/zgemm_asm.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_cc.yaml b/Tests/extended/double_complex/zgemm_hip_source_cc.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_cc.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_cc.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_cn.yaml b/Tests/extended/double_complex/zgemm_hip_source_cn.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_cn.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_cn.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_ct.yaml b/Tests/extended/double_complex/zgemm_hip_source_ct.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_ct.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_ct.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nc.yaml b/Tests/extended/double_complex/zgemm_hip_source_nc.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nc.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_nc.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nn.yaml b/Tests/extended/double_complex/zgemm_hip_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nn.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_nn.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_nt.yaml b/Tests/extended/double_complex/zgemm_hip_source_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_nt.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_nt.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tc.yaml b/Tests/extended/double_complex/zgemm_hip_source_tc.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tc.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_tc.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tn.yaml b/Tests/extended/double_complex/zgemm_hip_source_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tn.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_tn.yaml
diff --git a/Tensile/Tests/extended/double_complex/zgemm_hip_source_tt.yaml b/Tests/extended/double_complex/zgemm_hip_source_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/double_complex/zgemm_hip_source_tt.yaml
rename to Tests/extended/double_complex/zgemm_hip_source_tt.yaml
diff --git a/Tensile/Tests/extended/flat/test_dgemm_asm_flat.yaml b/Tests/extended/flat/test_dgemm_asm_flat.yaml
similarity index 100%
rename from Tensile/Tests/extended/flat/test_dgemm_asm_flat.yaml
rename to Tests/extended/flat/test_dgemm_asm_flat.yaml
diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat.yaml b/Tests/extended/flat/test_sgemm_asm_flat.yaml
similarity index 100%
rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat.yaml
rename to Tests/extended/flat/test_sgemm_asm_flat.yaml
diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml b/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_nt.yaml
rename to Tests/extended/flat/test_sgemm_asm_flat_nt.yaml
diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml b/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_tn.yaml
rename to Tests/extended/flat/test_sgemm_asm_flat_tn.yaml
diff --git a/Tensile/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml b/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/flat/test_sgemm_asm_flat_tt.yaml
rename to Tests/extended/flat/test_sgemm_asm_flat_tt.yaml
diff --git a/Tensile/Tests/extended/float8/f8gemm-hybrid-ss.yaml b/Tests/extended/float8/f8gemm-hybrid-ss.yaml
similarity index 100%
rename from Tensile/Tests/extended/float8/f8gemm-hybrid-ss.yaml
rename to Tests/extended/float8/f8gemm-hybrid-ss.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_asm.yaml b/Tests/extended/float_complex/cgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_asm.yaml
rename to Tests/extended/float_complex/cgemm_asm.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_cc.yaml b/Tests/extended/float_complex/cgemm_hip_source_cc.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_cc.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_cc.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_cn.yaml b/Tests/extended/float_complex/cgemm_hip_source_cn.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_cn.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_cn.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_ct.yaml b/Tests/extended/float_complex/cgemm_hip_source_ct.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_ct.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_ct.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nc.yaml b/Tests/extended/float_complex/cgemm_hip_source_nc.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nc.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_nc.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nn.yaml b/Tests/extended/float_complex/cgemm_hip_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nn.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_nn.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_nt.yaml b/Tests/extended/float_complex/cgemm_hip_source_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_nt.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_nt.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tc.yaml b/Tests/extended/float_complex/cgemm_hip_source_tc.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_tc.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_tc.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tn.yaml b/Tests/extended/float_complex/cgemm_hip_source_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_tn.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_tn.yaml
diff --git a/Tensile/Tests/extended/float_complex/cgemm_hip_source_tt.yaml b/Tests/extended/float_complex/cgemm_hip_source_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/float_complex/cgemm_hip_source_tt.yaml
rename to Tests/extended/float_complex/cgemm_hip_source_tt.yaml
diff --git a/Tensile/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml
similarity index 100%
rename from Tensile/Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml
rename to Tests/extended/fractional/test_dgemm_fractional_tile_sweep.yaml
diff --git a/Tensile/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml
similarity index 100%
rename from Tensile/Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml
rename to Tests/extended/fractional/test_hgemm_fractional_tile_sweep.yaml
diff --git a/Tensile/Tests/extended/fractional/test_sgemm_fractional_edge.yaml b/Tests/extended/fractional/test_sgemm_fractional_edge.yaml
similarity index 100%
rename from Tensile/Tests/extended/fractional/test_sgemm_fractional_edge.yaml
rename to Tests/extended/fractional/test_sgemm_fractional_edge.yaml
diff --git a/Tensile/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml b/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml
similarity index 100%
rename from Tensile/Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml
rename to Tests/extended/fractional/test_sgemm_fractional_tile_sweep.yaml
diff --git a/Tensile/Tests/extended/global_split_u/hgemm_gsu.yaml b/Tests/extended/global_split_u/hgemm_gsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/hgemm_gsu.yaml
rename to Tests/extended/global_split_u/hgemm_gsu.yaml
diff --git a/Tensile/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml b/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml
rename to Tests/extended/global_split_u/hgemm_gsu_minkforgsu.yaml
diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_batch.yaml b/Tests/extended/global_split_u/sgemm_gsu_batch.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_batch.yaml
rename to Tests/extended/global_split_u/sgemm_gsu_batch.yaml
diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta0.yaml
rename to Tests/extended/global_split_u/sgemm_gsu_beta0.yaml
diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta1.yaml
rename to Tests/extended/global_split_u/sgemm_gsu_beta1.yaml
diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml b/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_beta2.yaml
rename to Tests/extended/global_split_u/sgemm_gsu_beta2.yaml
diff --git a/Tensile/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml b/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml
similarity index 100%
rename from Tensile/Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml
rename to Tests/extended/global_split_u/sgemm_gsu_usebeta0.yaml
diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml
rename to Tests/extended/hpa_source/test_hgemm_hpa_src_nn.yaml
diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml
rename to Tests/extended/hpa_source/test_hgemm_hpa_src_nt.yaml
diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml
rename to Tests/extended/hpa_source/test_hgemm_hpa_src_tn.yaml
diff --git a/Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml b/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml
rename to Tests/extended/hpa_source/test_hgemm_hpa_src_tt.yaml
diff --git a/Tensile/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml b/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml
rename to Tests/extended/local_split_u/bfloat16_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/cgemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/cgemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/dgemm_lsu.yaml b/Tests/extended/local_split_u/dgemm_lsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/dgemm_lsu.yaml
rename to Tests/extended/local_split_u/dgemm_lsu.yaml
diff --git a/Tensile/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/dgemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/dgemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml b/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/f8gemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu.yaml b/Tests/extended/local_split_u/hgemm_lsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/hgemm_lsu.yaml
rename to Tests/extended/local_split_u/hgemm_lsu.yaml
diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml b/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml
rename to Tests/extended/local_split_u/hgemm_lsu_grvw2.yaml
diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/hgemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml b/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml
rename to Tests/extended/local_split_u/hgemm_lsu_mfma_a1b0.yaml
diff --git a/Tensile/Tests/extended/local_split_u/igemm_lsu_mfma.yaml b/Tests/extended/local_split_u/igemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/igemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/igemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/sgemm_lsu.yaml b/Tests/extended/local_split_u/sgemm_lsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/sgemm_lsu.yaml
rename to Tests/extended/local_split_u/sgemm_lsu.yaml
diff --git a/Tensile/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/sgemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/sgemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml b/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml
similarity index 100%
rename from Tensile/Tests/extended/local_split_u/zgemm_lsu_mfma.yaml
rename to Tests/extended/local_split_u/zgemm_lsu_mfma.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml b/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml
rename to Tests/extended/mirror_dims/mirror_dims_1sum_zp.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_other.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_summ_zp_unroll.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_summ.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_other.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml
rename to Tests/extended/mirror_dims/mirror_dims_2sum_mir_unroll_zp_unroll.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ1_summ2.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ2.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_summ_zp_other.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_summ1.yaml
diff --git a/Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml b/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml
rename to Tests/extended/mirror_dims/mirror_dims_3sum_mir_unroll_zp_other.yaml
diff --git a/Tensile/Tests/extended/multi_sum/2sum.yaml b/Tests/extended/multi_sum/2sum.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/2sum.yaml
rename to Tests/extended/multi_sum/2sum.yaml
diff --git a/Tensile/Tests/extended/multi_sum/2sum_gsu.yaml b/Tests/extended/multi_sum/2sum_gsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/2sum_gsu.yaml
rename to Tests/extended/multi_sum/2sum_gsu.yaml
diff --git a/Tensile/Tests/extended/multi_sum/2sum_gsu_simple.yaml b/Tests/extended/multi_sum/2sum_gsu_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/2sum_gsu_simple.yaml
rename to Tests/extended/multi_sum/2sum_gsu_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum/2sum_gsu_src.yaml b/Tests/extended/multi_sum/2sum_gsu_src.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/2sum_gsu_src.yaml
rename to Tests/extended/multi_sum/2sum_gsu_src.yaml
diff --git a/Tensile/Tests/extended/multi_sum/2sum_src.yaml b/Tests/extended/multi_sum/2sum_src.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/2sum_src.yaml
rename to Tests/extended/multi_sum/2sum_src.yaml
diff --git a/Tensile/Tests/extended/multi_sum/3sum_gsu.yaml b/Tests/extended/multi_sum/3sum_gsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/3sum_gsu.yaml
rename to Tests/extended/multi_sum/3sum_gsu.yaml
diff --git a/Tensile/Tests/extended/multi_sum/simple_sum2_scrambled.yaml b/Tests/extended/multi_sum/simple_sum2_scrambled.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum/simple_sum2_scrambled.yaml
rename to Tests/extended/multi_sum/simple_sum2_scrambled.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml
rename to Tests/extended/multi_sum_psd/1sum_gsu_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/1sum_simple.yaml b/Tests/extended/multi_sum_psd/1sum_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/1sum_simple.yaml
rename to Tests/extended/multi_sum_psd/1sum_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum.yaml b/Tests/extended/multi_sum_psd/2sum.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum.yaml
rename to Tests/extended/multi_sum_psd/2sum.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsu.yaml b/Tests/extended/multi_sum_psd/2sum_gsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsu.yaml
rename to Tests/extended/multi_sum_psd/2sum_gsu.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml
rename to Tests/extended/multi_sum_psd/2sum_gsu_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml b/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml
rename to Tests/extended/multi_sum_psd/2sum_gsuremainder.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml b/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml
rename to Tests/extended/multi_sum_psd/2sum_gsuremainder_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_pbd.yaml b/Tests/extended/multi_sum_psd/2sum_pbd.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_pbd.yaml
rename to Tests/extended/multi_sum_psd/2sum_pbd.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml b/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml
rename to Tests/extended/multi_sum_psd/2sum_scrambled_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum.yaml b/Tests/extended/multi_sum_psd/3sum.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/3sum.yaml
rename to Tests/extended/multi_sum_psd/3sum.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_gsu.yaml b/Tests/extended/multi_sum_psd/3sum_gsu.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/3sum_gsu.yaml
rename to Tests/extended/multi_sum_psd/3sum_gsu.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml b/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml
rename to Tests/extended/multi_sum_psd/3sum_gsu_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/3sum_simple.yaml b/Tests/extended/multi_sum_psd/3sum_simple.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/3sum_simple.yaml
rename to Tests/extended/multi_sum_psd/3sum_simple.yaml
diff --git a/Tensile/Tests/extended/multi_sum_psd/README b/Tests/extended/multi_sum_psd/README
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/README
rename to Tests/extended/multi_sum_psd/README
diff --git a/Tensile/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml b/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml
similarity index 100%
rename from Tensile/Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml
rename to Tests/extended/multi_sum_psd/hackable_simple_unrollinc1.yaml
diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_nn.yaml b/Tests/extended/nonbatched/sgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/nonbatched/sgemm_asm_nn.yaml
rename to Tests/extended/nonbatched/sgemm_asm_nn.yaml
diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_nt.yaml b/Tests/extended/nonbatched/sgemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/nonbatched/sgemm_asm_nt.yaml
rename to Tests/extended/nonbatched/sgemm_asm_nt.yaml
diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_tn.yaml b/Tests/extended/nonbatched/sgemm_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/nonbatched/sgemm_asm_tn.yaml
rename to Tests/extended/nonbatched/sgemm_asm_tn.yaml
diff --git a/Tensile/Tests/extended/nonbatched/sgemm_asm_tt.yaml b/Tests/extended/nonbatched/sgemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/extended/nonbatched/sgemm_asm_tt.yaml
rename to Tests/extended/nonbatched/sgemm_asm_tt.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/multi_free2.yaml b/Tests/extended/pack_tensor_dims/multi_free2.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/multi_free2.yaml
rename to Tests/extended/pack_tensor_dims/multi_free2.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/multi_free_batch.yaml b/Tests/extended/pack_tensor_dims/multi_free_batch.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/multi_free_batch.yaml
rename to Tests/extended/pack_tensor_dims/multi_free_batch.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml b/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/packed_perf_nn.yaml
rename to Tests/extended/pack_tensor_dims/packed_perf_nn.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml b/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml
rename to Tests/extended/pack_tensor_dims/simple_stridea0_pack.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml b/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml
rename to Tests/extended/pack_tensor_dims/simple_strideb0_pack.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml b/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml
rename to Tests/extended/pack_tensor_dims/strideb0_pack_nt.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml b/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml
rename to Tests/extended/pack_tensor_dims/strideb0_pack_tn.yaml
diff --git a/Tensile/Tests/extended/pack_tensor_dims/vectorstore0.yaml b/Tests/extended/pack_tensor_dims/vectorstore0.yaml
similarity index 100%
rename from Tensile/Tests/extended/pack_tensor_dims/vectorstore0.yaml
rename to Tests/extended/pack_tensor_dims/vectorstore0.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_NN.yaml b/Tests/extended/stagger_u/big_skinny_A_NN.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_A_NN.yaml
rename to Tests/extended/stagger_u/big_skinny_A_NN.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_NT.yaml b/Tests/extended/stagger_u/big_skinny_A_NT.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_A_NT.yaml
rename to Tests/extended/stagger_u/big_skinny_A_NT.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_TN.yaml b/Tests/extended/stagger_u/big_skinny_A_TN.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_A_TN.yaml
rename to Tests/extended/stagger_u/big_skinny_A_TN.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_A_TT.yaml b/Tests/extended/stagger_u/big_skinny_A_TT.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_A_TT.yaml
rename to Tests/extended/stagger_u/big_skinny_A_TT.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_NN.yaml b/Tests/extended/stagger_u/big_skinny_B_NN.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_B_NN.yaml
rename to Tests/extended/stagger_u/big_skinny_B_NN.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_NT.yaml b/Tests/extended/stagger_u/big_skinny_B_NT.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_B_NT.yaml
rename to Tests/extended/stagger_u/big_skinny_B_NT.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_TN.yaml b/Tests/extended/stagger_u/big_skinny_B_TN.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_B_TN.yaml
rename to Tests/extended/stagger_u/big_skinny_B_TN.yaml
diff --git a/Tensile/Tests/extended/stagger_u/big_skinny_B_TT.yaml b/Tests/extended/stagger_u/big_skinny_B_TT.yaml
similarity index 100%
rename from Tensile/Tests/extended/stagger_u/big_skinny_B_TT.yaml
rename to Tests/extended/stagger_u/big_skinny_B_TT.yaml
diff --git a/Tensile/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml b/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml
similarity index 100%
rename from Tensile/Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml
rename to Tests/extended/stream_k/sk_2tile_hgemm_hhs.yaml
diff --git a/Tensile/Tests/extended/stream_k/sk_2tile_sgemm.yaml b/Tests/extended/stream_k/sk_2tile_sgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/stream_k/sk_2tile_sgemm.yaml
rename to Tests/extended/stream_k/sk_2tile_sgemm.yaml
diff --git a/Tensile/Tests/extended/stream_k/sk_hgemm_hhs.yaml b/Tests/extended/stream_k/sk_hgemm_hhs.yaml
similarity index 100%
rename from Tensile/Tests/extended/stream_k/sk_hgemm_hhs.yaml
rename to Tests/extended/stream_k/sk_hgemm_hhs.yaml
diff --git a/Tensile/Tests/extended/stream_k/sk_sgemm.yaml b/Tests/extended/stream_k/sk_sgemm.yaml
similarity index 100%
rename from Tensile/Tests/extended/stream_k/sk_sgemm.yaml
rename to Tests/extended/stream_k/sk_sgemm.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/README b/Tests/extended/tensor_contraction/README
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/README
rename to Tests/extended/tensor_contraction/README
diff --git a/Tensile/Tests/extended/tensor_contraction/allownofree.yaml b/Tests/extended/tensor_contraction/allownofree.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/allownofree.yaml
rename to Tests/extended/tensor_contraction/allownofree.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/assert_size_equal.yaml b/Tests/extended/tensor_contraction/assert_size_equal.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/assert_size_equal.yaml
rename to Tests/extended/tensor_contraction/assert_size_equal.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/exact_conv.yaml b/Tests/extended/tensor_contraction/exact_conv.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/exact_conv.yaml
rename to Tests/extended/tensor_contraction/exact_conv.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/filter.yaml b/Tests/extended/tensor_contraction/filter.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/filter.yaml
rename to Tests/extended/tensor_contraction/filter.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/ncdhw.yaml b/Tests/extended/tensor_contraction/ncdhw.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/ncdhw.yaml
rename to Tests/extended/tensor_contraction/ncdhw.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/sweep_packed_dims.yaml b/Tests/extended/tensor_contraction/sweep_packed_dims.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/sweep_packed_dims.yaml
rename to Tests/extended/tensor_contraction/sweep_packed_dims.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle0.yaml b/Tests/extended/tensor_contraction/swizzle0.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/swizzle0.yaml
rename to Tests/extended/tensor_contraction/swizzle0.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle1.yaml b/Tests/extended/tensor_contraction/swizzle1.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/swizzle1.yaml
rename to Tests/extended/tensor_contraction/swizzle1.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle2.yaml b/Tests/extended/tensor_contraction/swizzle2.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/swizzle2.yaml
rename to Tests/extended/tensor_contraction/swizzle2.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/swizzle3.yaml b/Tests/extended/tensor_contraction/swizzle3.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/swizzle3.yaml
rename to Tests/extended/tensor_contraction/swizzle3.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml b/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml
rename to Tests/extended/tensor_contraction/test_ncdhw_packed_strides3d_defaults.contraction.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml b/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml
rename to Tests/extended/tensor_contraction/test_ncdhw_packed_strides_filter3d.contraction.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml b/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml
rename to Tests/extended/tensor_contraction/test_nchw_filter_contraction.yaml
diff --git a/Tensile/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml b/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml
similarity index 100%
rename from Tensile/Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml
rename to Tests/extended/tensor_contraction/tlu0_non_unit_stride.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml b/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml
rename to Tests/extended/use_initial_strides/simple_use_initial_strides_1.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides/test_1.yaml b/Tests/extended/use_initial_strides/test_1.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides/test_1.yaml
rename to Tests/extended/use_initial_strides/test_1.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides/test_2.yaml b/Tests/extended/use_initial_strides/test_2.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides/test_2.yaml
rename to Tests/extended/use_initial_strides/test_2.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides/test_strides.yaml b/Tests/extended/use_initial_strides/test_strides.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides/test_strides.yaml
rename to Tests/extended/use_initial_strides/test_strides.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides/test_strides1.yaml b/Tests/extended/use_initial_strides/test_strides1.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides/test_strides1.yaml
rename to Tests/extended/use_initial_strides/test_strides1.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml b/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml
rename to Tests/extended/use_initial_strides_cd/perf_uis_cd_specialized.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml b/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml
rename to Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_0.yaml
diff --git a/Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml b/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml
similarity index 100%
rename from Tensile/Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml
rename to Tests/extended/use_initial_strides_cd/test_use_initial_strides_cd_2.yaml
diff --git a/Tensile/Tests/extended/vector_width/hgemm_nn_asm.yaml b/Tests/extended/vector_width/hgemm_nn_asm.yaml
similarity index 100%
rename from Tensile/Tests/extended/vector_width/hgemm_nn_asm.yaml
rename to Tests/extended/vector_width/hgemm_nn_asm.yaml
diff --git a/Tensile/Tests/extended/vector_width/sgemm_nn_asm.yaml b/Tests/extended/vector_width/sgemm_nn_asm.yaml
similarity index 100%
rename from Tensile/Tests/extended/vector_width/sgemm_nn_asm.yaml
rename to Tests/extended/vector_width/sgemm_nn_asm.yaml
diff --git a/Tensile/Tests/extended/vector_width/sgemm_nn_source.yaml b/Tests/extended/vector_width/sgemm_nn_source.yaml
similarity index 100%
rename from Tensile/Tests/extended/vector_width/sgemm_nn_source.yaml
rename to Tests/extended/vector_width/sgemm_nn_source.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_2sum_zpother.yaml b/Tests/extended/zeropad/test_zp_2sum_zpother.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_2sum_zpother.yaml
rename to Tests/extended/zeropad/test_zp_2sum_zpother.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_1sum.yaml b/Tests/extended/zeropad/test_zp_simple_1sum.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_simple_1sum.yaml
rename to Tests/extended/zeropad/test_zp_simple_1sum.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml
rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_both.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml
rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_other.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml b/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml
rename to Tests/extended/zeropad/test_zp_simple_2sum_zp_unroll.yaml
diff --git a/Tensile/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml b/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml
similarity index 100%
rename from Tensile/Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml
rename to Tests/extended/zeropad/test_zp_simple_3sum_zp_other.yaml
diff --git a/Tensile/Tests/hipModuleLoad_timing/Makefile b/Tests/hipModuleLoad_timing/Makefile
similarity index 100%
rename from Tensile/Tests/hipModuleLoad_timing/Makefile
rename to Tests/hipModuleLoad_timing/Makefile
diff --git a/Tensile/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp b/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp
similarity index 100%
rename from Tensile/Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp
rename to Tests/hipModuleLoad_timing/hipModuleLoadTiming.cpp
diff --git a/Tensile/Tests/integration/test_integration.py b/Tests/integration/test_integration.py
similarity index 100%
rename from Tensile/Tests/integration/test_integration.py
rename to Tests/integration/test_integration.py
diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml
rename to Tests/pre_checkin/4xi8gemm_hpa_hip_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml
rename to Tests/pre_checkin/4xi8gemm_hpa_hip_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml
rename to Tests/pre_checkin/4xi8gemm_hpa_hip_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml b/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml
rename to Tests/pre_checkin/4xi8gemm_hpa_hip_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml b/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16_hpa_source_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml b/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml
rename to Tests/pre_checkin/bfloat16/bfloat16s_hpa_source_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/cov/COV4.yaml b/Tests/pre_checkin/cov/COV4.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/cov/COV4.yaml
rename to Tests/pre_checkin/cov/COV4.yaml
diff --git a/Tensile/Tests/pre_checkin/cov/COV5.yaml b/Tests/pre_checkin/cov/COV5.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/cov/COV5.yaml
rename to Tests/pre_checkin/cov/COV5.yaml
diff --git a/Tensile/Tests/pre_checkin/cov/COVDefault.yaml b/Tests/pre_checkin/cov/COVDefault.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/cov/COVDefault.yaml
rename to Tests/pre_checkin/cov/COVDefault.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml b/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml
rename to Tests/pre_checkin/denorm/bfloat16_hpa_source_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/dgemm_asm.yaml b/Tests/pre_checkin/denorm/dgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/dgemm_asm.yaml
rename to Tests/pre_checkin/denorm/dgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml
rename to Tests/pre_checkin/denorm/hgemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml b/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml
rename to Tests/pre_checkin/denorm/mfma/bfloat16_1k_denorm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml b/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml
rename to Tests/pre_checkin/denorm/mfma/bfloat16_denorm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml
rename to Tests/pre_checkin/denorm/mfma/dgemm_denorm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml
rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml
rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml b/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml
rename to Tests/pre_checkin/denorm/mfma/hgemm_denorm_alt_rnz.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml b/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml
rename to Tests/pre_checkin/denorm/mfma/sgemm_denorm.yaml
diff --git a/Tensile/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml b/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/denorm/sgemm_asm_nn.yaml
rename to Tests/pre_checkin/denorm/sgemm_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/dgemm_asm.yaml b/Tests/pre_checkin/dgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/dgemm_asm.yaml
rename to Tests/pre_checkin/dgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/dgemm_general_batch_asm.yaml b/Tests/pre_checkin/dgemm_general_batch_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/dgemm_general_batch_asm.yaml
rename to Tests/pre_checkin/dgemm_general_batch_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml b/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml
rename to Tests/pre_checkin/direct_to_vgpr/dtv_sgemm_lite.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_cc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_cn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_ct.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_nc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_tc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml b/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml
rename to Tests/pre_checkin/double_complex/double_complex_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_cc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_cn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_ct.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_nc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_tc.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml b/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml
rename to Tests/pre_checkin/double_complex/double_complex_hip_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_cc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_cn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_ct.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_nc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_tc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml b/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml
rename to Tests/pre_checkin/float_complex/float_complex_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_cc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_cn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_ct.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_nc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_tc.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml b/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml
rename to Tests/pre_checkin/float_complex/float_complex_hip_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml b/Tests/pre_checkin/hgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml
rename to Tests/pre_checkin/hgemm_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml b/Tests/pre_checkin/hgemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml
rename to Tests/pre_checkin/hgemm_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml b/Tests/pre_checkin/hgemm_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml
rename to Tests/pre_checkin/hgemm_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml b/Tests/pre_checkin/hgemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml
rename to Tests/pre_checkin/hgemm_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml b/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml
rename to Tests/pre_checkin/hgemm_general_batch_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml b/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml
rename to Tests/pre_checkin/hgemm_general_batch_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_f32_alphabeta_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml b/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml b/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml
rename to Tests/pre_checkin/hgemm_hpa_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml
rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml
rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml b/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml
rename to Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml
rename to Tests/pre_checkin/hsgemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml
rename to Tests/pre_checkin/hsgemm_hpa_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml
rename to Tests/pre_checkin/hsgemm_hpa_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml b/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml
rename to Tests/pre_checkin/hsgemm_hpa_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml
rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml
rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml
rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml b/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml
rename to Tests/pre_checkin/hsgemm_hpa_iu2_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/igemm_hpa_asm_nn.yaml b/Tests/pre_checkin/igemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/igemm_hpa_asm_nn.yaml
rename to Tests/pre_checkin/igemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/igemm_hpa_hip_nn.yaml b/Tests/pre_checkin/igemm_hpa_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/igemm_hpa_hip_nn.yaml
rename to Tests/pre_checkin/igemm_hpa_hip_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/1LDSB.yaml b/Tests/pre_checkin/mfma/1LDSB.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/1LDSB.yaml
rename to Tests/pre_checkin/mfma/1LDSB.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml b/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml
rename to Tests/pre_checkin/mfma/c-tile-reuse-no-nll.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml b/Tests/pre_checkin/mfma/cgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/cgemm_asm.yaml
rename to Tests/pre_checkin/mfma/cgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml b/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml
rename to Tests/pre_checkin/mfma/cgemm_asm_conjugate.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml b/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml
rename to Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_asm.yaml b/Tests/pre_checkin/mfma/dgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/dgemm_asm.yaml
rename to Tests/pre_checkin/mfma/dgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml b/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml
rename to Tests/pre_checkin/mfma/dgemm_gb_global_ldd.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml b/Tests/pre_checkin/mfma/dgemm_large_offset.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml
rename to Tests/pre_checkin/mfma/dgemm_large_offset.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16_gemm_asm_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16_general_batch_gemm_asm_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml
rename to Tests/pre_checkin/mfma/hpa_bfloat16s_gemm_asm_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_hgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_hgemm_f32_alphabeta_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_hgemm_general_batch_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml b/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml
rename to Tests/pre_checkin/mfma/hpa_hgemm_split_lds.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml b/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_hsgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml
rename to Tests/pre_checkin/mfma/hpa_igemm_i8_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml
rename to Tests/pre_checkin/mfma/hpa_igemm_i8_asm_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml
rename to Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml b/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml
rename to Tests/pre_checkin/mfma/hpa_igemm_i8_split_lds_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml b/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml
rename to Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml b/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml
rename to Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml b/Tests/pre_checkin/mfma/sgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_asm.yaml
rename to Tests/pre_checkin/mfma/sgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml b/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml
rename to Tests/pre_checkin/mfma/sgemm_general_batch_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml b/Tests/pre_checkin/mfma/sgemm_split_lds.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_split_lds.yaml
rename to Tests/pre_checkin/mfma/sgemm_split_lds.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml b/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml
rename to Tests/pre_checkin/mfma/sgemm_xf32_asm_gfx940.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml b/Tests/pre_checkin/mfma/wider_local_read.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/wider_local_read.yaml
rename to Tests/pre_checkin/mfma/wider_local_read.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/zgemm_asm.yaml b/Tests/pre_checkin/mfma/zgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/zgemm_asm.yaml
rename to Tests/pre_checkin/mfma/zgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml b/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml
rename to Tests/pre_checkin/mfma/zgemm_asm_conjugate.yaml
diff --git a/Tensile/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml b/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml
rename to Tests/pre_checkin/no_load_loop/nll_reproduce_bug.yaml
diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml
rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml
rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml
rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml b/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml
rename to Tests/pre_checkin/no_load_loop/sgemm_nll_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml b/Tests/pre_checkin/regression/persistent_kernel.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/regression/persistent_kernel.yaml
rename to Tests/pre_checkin/regression/persistent_kernel.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml b/Tests/pre_checkin/sgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml
rename to Tests/pre_checkin/sgemm_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml b/Tests/pre_checkin/sgemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml
rename to Tests/pre_checkin/sgemm_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml b/Tests/pre_checkin/sgemm_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml
rename to Tests/pre_checkin/sgemm_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml b/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_asm_tn_bigk.yaml
rename to Tests/pre_checkin/sgemm_asm_tn_bigk.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml b/Tests/pre_checkin/sgemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml
rename to Tests/pre_checkin/sgemm_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_exact_dict.yaml b/Tests/pre_checkin/sgemm_exact_dict.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_exact_dict.yaml
rename to Tests/pre_checkin/sgemm_exact_dict.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml b/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml
rename to Tests/pre_checkin/sgemm_general_batch_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/source/test_dgemm_defaults.yaml b/Tests/pre_checkin/source/test_dgemm_defaults.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/source/test_dgemm_defaults.yaml
rename to Tests/pre_checkin/source/test_dgemm_defaults.yaml
diff --git a/Tensile/Tests/pre_checkin/source/test_hgemm_defaults.yaml b/Tests/pre_checkin/source/test_hgemm_defaults.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/source/test_hgemm_defaults.yaml
rename to Tests/pre_checkin/source/test_hgemm_defaults.yaml
diff --git a/Tensile/Tests/pre_checkin/source/test_hgemm_hpa.yaml b/Tests/pre_checkin/source/test_hgemm_hpa.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/source/test_hgemm_hpa.yaml
rename to Tests/pre_checkin/source/test_hgemm_hpa.yaml
diff --git a/Tensile/Tests/pre_checkin/source/test_sgemm_defaults.yaml b/Tests/pre_checkin/source/test_sgemm_defaults.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/source/test_sgemm_defaults.yaml
rename to Tests/pre_checkin/source/test_sgemm_defaults.yaml
diff --git a/Tensile/Tests/pre_checkin/wmma/hgemm_wmma.yaml b/Tests/pre_checkin/wmma/hgemm_wmma.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/wmma/hgemm_wmma.yaml
rename to Tests/pre_checkin/wmma/hgemm_wmma.yaml
diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml
rename to Tests/pre_checkin/wmma/hpa_bfloat16_gemm_wmma.yaml
diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml
rename to Tests/pre_checkin/wmma/hpa_hgemm_wmma.yaml
diff --git a/Tensile/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml b/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml
rename to Tests/pre_checkin/wmma/hpa_igemm_wmma.yaml
diff --git a/Tensile/Tests/special/global_split_u_src/README b/Tests/special/global_split_u_src/README
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/README
rename to Tests/special/global_split_u_src/README
diff --git a/Tensile/Tests/special/global_split_u_src/hgemm_gsu.yaml b/Tests/special/global_split_u_src/hgemm_gsu.yaml
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/hgemm_gsu.yaml
rename to Tests/special/global_split_u_src/hgemm_gsu.yaml
diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml b/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml
rename to Tests/special/global_split_u_src/sgemm_gsu_beta0.yaml
diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml b/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml
rename to Tests/special/global_split_u_src/sgemm_gsu_beta1.yaml
diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml b/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml
rename to Tests/special/global_split_u_src/sgemm_gsu_beta2.yaml
diff --git a/Tensile/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml b/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml
similarity index 100%
rename from Tensile/Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml
rename to Tests/special/global_split_u_src/sgemm_gsu_usebeta0.yaml
diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_lsu.yaml b/Tests/special/igemm/igemm_hpa_hip_lsu.yaml
similarity index 100%
rename from Tensile/Tests/special/igemm/igemm_hpa_hip_lsu.yaml
rename to Tests/special/igemm/igemm_hpa_hip_lsu.yaml
diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_nn.yaml b/Tests/special/igemm/igemm_hpa_hip_nn.yaml
similarity index 100%
rename from Tensile/Tests/special/igemm/igemm_hpa_hip_nn.yaml
rename to Tests/special/igemm/igemm_hpa_hip_nn.yaml
diff --git a/Tensile/Tests/special/igemm/igemm_hpa_hip_tt.yaml b/Tests/special/igemm/igemm_hpa_hip_tt.yaml
similarity index 100%
rename from Tensile/Tests/special/igemm/igemm_hpa_hip_tt.yaml
rename to Tests/special/igemm/igemm_hpa_hip_tt.yaml
diff --git a/Tensile/Tests/test_data/unit/library_data/hardcodedParameters.yaml b/Tests/test_data/unit/library_data/hardcodedParameters.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/hardcodedParameters.yaml
rename to Tests/test_data/unit/library_data/hardcodedParameters.yaml
diff --git a/Tensile/Tests/test_data/unit/library_data/initialSolutionParameters.yaml b/Tests/test_data/unit/library_data/initialSolutionParameters.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/initialSolutionParameters.yaml
rename to Tests/test_data/unit/library_data/initialSolutionParameters.yaml
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1010.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx1011.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx803.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx900.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx906.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco b/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco
rename to Tests/test_data/unit/library_data/library/Kernels.so-000-gfx908.hsaco
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary.yaml b/Tests/test_data/unit/library_data/library/TensileLibrary.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary.yaml
rename to Tests/test_data/unit/library_data/library/TensileLibrary.yaml
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx1010.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx1011.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx803.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx900.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx906.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co b/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co
rename to Tests/test_data/unit/library_data/library/TensileLibrary_gfx908.co
diff --git a/Tensile/Tests/test_data/unit/library_data/library/metadata.yaml b/Tests/test_data/unit/library_data/library/metadata.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/library/metadata.yaml
rename to Tests/test_data/unit/library_data/library/metadata.yaml
diff --git a/Tensile/Tests/test_data/unit/library_data/problemType.yaml b/Tests/test_data/unit/library_data/problemType.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/library_data/problemType.yaml
rename to Tests/test_data/unit/library_data/problemType.yaml
diff --git a/Tensile/Tests/test_data/unit/solutions/solutions_nn_3.yaml b/Tests/test_data/unit/solutions/solutions_nn_3.yaml
similarity index 100%
rename from Tensile/Tests/test_data/unit/solutions/solutions_nn_3.yaml
rename to Tests/test_data/unit/solutions/solutions_nn_3.yaml
diff --git a/Tensile/Tests/unit/__init__.py b/Tests/unit/__init__.py
similarity index 100%
rename from Tensile/Tests/unit/__init__.py
rename to Tests/unit/__init__.py
diff --git a/Tensile/Tests/unit/customKernels/TestKernel.s b/Tests/unit/customKernels/TestKernel.s
similarity index 100%
rename from Tensile/Tests/unit/customKernels/TestKernel.s
rename to Tests/unit/customKernels/TestKernel.s
diff --git a/Tensile/Tests/unit/replacement/bad_file/bad.txt b/Tests/unit/replacement/bad_file/bad.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/bad_file/bad.txt
rename to Tests/unit/replacement/bad_file/bad.txt
diff --git a/Tensile/Tests/unit/replacement/duplicate_kernel/a.txt b/Tests/unit/replacement/duplicate_kernel/a.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/duplicate_kernel/a.txt
rename to Tests/unit/replacement/duplicate_kernel/a.txt
diff --git a/Tensile/Tests/unit/replacement/duplicate_kernel/b.txt b/Tests/unit/replacement/duplicate_kernel/b.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/duplicate_kernel/b.txt
rename to Tests/unit/replacement/duplicate_kernel/b.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/baz.s.txt b/Tests/unit/replacement/known_kernels_v2/baz.s.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v2/baz.s.txt
rename to Tests/unit/replacement/known_kernels_v2/baz.s.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt b/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt
rename to Tests/unit/replacement/known_kernels_v2/kernel_named_bar.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt b/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt
rename to Tests/unit/replacement/known_kernels_v2/kernel_named_foo.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/baz.s.txt b/Tests/unit/replacement/known_kernels_v3/baz.s.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v3/baz.s.txt
rename to Tests/unit/replacement/known_kernels_v3/baz.s.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt b/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt
rename to Tests/unit/replacement/known_kernels_v3/kernel_named_bar.txt
diff --git a/Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt b/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt
similarity index 100%
rename from Tensile/Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt
rename to Tests/unit/replacement/known_kernels_v3/kernel_named_foo.txt
diff --git a/Tensile/Tests/unit/test_Common.py b/Tests/unit/test_Common.py
similarity index 100%
rename from Tensile/Tests/unit/test_Common.py
rename to Tests/unit/test_Common.py
diff --git a/Tensile/Tests/unit/test_Component.py b/Tests/unit/test_Component.py
similarity index 100%
rename from Tensile/Tests/unit/test_Component.py
rename to Tests/unit/test_Component.py
diff --git a/Tensile/Tests/unit/test_Configuration.py b/Tests/unit/test_Configuration.py
similarity index 100%
rename from Tensile/Tests/unit/test_Configuration.py
rename to Tests/unit/test_Configuration.py
diff --git a/Tensile/Tests/unit/test_CustomKernels.py b/Tests/unit/test_CustomKernels.py
similarity index 100%
rename from Tensile/Tests/unit/test_CustomKernels.py
rename to Tests/unit/test_CustomKernels.py
diff --git a/Tensile/Tests/unit/test_DataType.py b/Tests/unit/test_DataType.py
similarity index 100%
rename from Tensile/Tests/unit/test_DataType.py
rename to Tests/unit/test_DataType.py
diff --git a/Tensile/Tests/unit/test_HardwarePredicates.py b/Tests/unit/test_HardwarePredicates.py
similarity index 100%
rename from Tensile/Tests/unit/test_HardwarePredicates.py
rename to Tests/unit/test_HardwarePredicates.py
diff --git a/Tensile/Tests/unit/test_KernelWriterAssembly.py b/Tests/unit/test_KernelWriterAssembly.py
similarity index 100%
rename from Tensile/Tests/unit/test_KernelWriterAssembly.py
rename to Tests/unit/test_KernelWriterAssembly.py
diff --git a/Tensile/Tests/unit/test_LibraryIO.py b/Tests/unit/test_LibraryIO.py
similarity index 100%
rename from Tensile/Tests/unit/test_LibraryIO.py
rename to Tests/unit/test_LibraryIO.py
diff --git a/Tensile/Tests/unit/test_PerfMetricPredicates.py b/Tests/unit/test_PerfMetricPredicates.py
similarity index 100%
rename from Tensile/Tests/unit/test_PerfMetricPredicates.py
rename to Tests/unit/test_PerfMetricPredicates.py
diff --git a/Tensile/Tests/unit/test_Priority.py b/Tests/unit/test_Priority.py
similarity index 100%
rename from Tensile/Tests/unit/test_Priority.py
rename to Tests/unit/test_Priority.py
diff --git a/Tensile/Tests/unit/test_ReplacementKernels.py b/Tests/unit/test_ReplacementKernels.py
similarity index 100%
rename from Tensile/Tests/unit/test_ReplacementKernels.py
rename to Tests/unit/test_ReplacementKernels.py
diff --git a/Tensile/Tests/unit/test_TensileCreateLibrary.py b/Tests/unit/test_TensileCreateLibrary.py
similarity index 100%
rename from Tensile/Tests/unit/test_TensileCreateLibrary.py
rename to Tests/unit/test_TensileCreateLibrary.py
diff --git a/Tensile/Tests/unit/test_conv_problem.py b/Tests/unit/test_conv_problem.py
similarity index 100%
rename from Tensile/Tests/unit/test_conv_problem.py
rename to Tests/unit/test_conv_problem.py
diff --git a/Tensile/Tests/unit/test_exact_problem.py b/Tests/unit/test_exact_problem.py
similarity index 100%
rename from Tensile/Tests/unit/test_exact_problem.py
rename to Tests/unit/test_exact_problem.py
diff --git a/Tensile/Tests/unit/test_makeProblem.py b/Tests/unit/test_makeProblem.py
similarity index 100%
rename from Tensile/Tests/unit/test_makeProblem.py
rename to Tests/unit/test_makeProblem.py
diff --git a/Tensile/Tests/unit/test_mergeLogic.py b/Tests/unit/test_mergeLogic.py
similarity index 100%
rename from Tensile/Tests/unit/test_mergeLogic.py
rename to Tests/unit/test_mergeLogic.py
diff --git a/Tensile/Tests/unit/test_tryAssembler.py b/Tests/unit/test_tryAssembler.py
similarity index 100%
rename from Tensile/Tests/unit/test_tryAssembler.py
rename to Tests/unit/test_tryAssembler.py
diff --git a/Tensile/Tests/unit/test_useGlobalParameters.py b/Tests/unit/test_useGlobalParameters.py
similarity index 100%
rename from Tensile/Tests/unit/test_useGlobalParameters.py
rename to Tests/unit/test_useGlobalParameters.py
diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_nn.yaml b/Tests/vega_20/fast/igemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/fast/igemm_asm_nn.yaml
rename to Tests/vega_20/fast/igemm_asm_nn.yaml
diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_nt.yaml b/Tests/vega_20/fast/igemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/fast/igemm_asm_nt.yaml
rename to Tests/vega_20/fast/igemm_asm_nt.yaml
diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_tn.yaml b/Tests/vega_20/fast/igemm_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/fast/igemm_asm_tn.yaml
rename to Tests/vega_20/fast/igemm_asm_tn.yaml
diff --git a/Tensile/Tests/vega_20/fast/igemm_asm_tt.yaml b/Tests/vega_20/fast/igemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/fast/igemm_asm_tt.yaml
rename to Tests/vega_20/fast/igemm_asm_tt.yaml
diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml
rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta0.yaml
diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml
rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta1.yaml
diff --git a/Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml b/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml
rename to Tests/vega_20/nightly/global_split_u/igemm_gsu_beta2.yaml
diff --git a/Tensile/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml b/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml
similarity index 100%
rename from Tensile/Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml
rename to Tests/vega_20/nightly/local_split_u/igemm_lsu.yaml
diff --git a/Tensile/Tests/weekly/assertions/README b/Tests/weekly/assertions/README
similarity index 100%
rename from Tensile/Tests/weekly/assertions/README
rename to Tests/weekly/assertions/README
diff --git a/Tensile/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml b/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml
similarity index 100%
rename from Tensile/Tests/weekly/assertions/test_hgemm_asem2_asm.yaml
rename to Tests/weekly/assertions/test_hgemm_asem2_asm.yaml
diff --git a/Tensile/Tests/weekly/classic_source/test_hgemm_vectors.yaml b/Tests/weekly/classic_source/test_hgemm_vectors.yaml
similarity index 100%
rename from Tensile/Tests/weekly/classic_source/test_hgemm_vectors.yaml
rename to Tests/weekly/classic_source/test_hgemm_vectors.yaml
diff --git a/Tensile/Tests/weekly/classic_source/test_sgemm_vectors.yaml b/Tests/weekly/classic_source/test_sgemm_vectors.yaml
similarity index 100%
rename from Tensile/Tests/weekly/classic_source/test_sgemm_vectors.yaml
rename to Tests/weekly/classic_source/test_sgemm_vectors.yaml
diff --git a/Tensile/Tests/yaml_only/test_config.py b/Tests/yaml_only/test_config.py
similarity index 100%
rename from Tensile/Tests/yaml_only/test_config.py
rename to Tests/yaml_only/test_config.py
diff --git a/Tensile/Tests/yaml_only/test_ya b/Tests/yaml_only/test_ya
similarity index 100%
rename from Tensile/Tests/yaml_only/test_ya
rename to Tests/yaml_only/test_ya
diff --git a/Tensile/AsmMemoryInstruction.py b/src/Tensile/AsmMemoryInstruction.py
similarity index 100%
rename from Tensile/AsmMemoryInstruction.py
rename to src/Tensile/AsmMemoryInstruction.py
diff --git a/Tensile/AsmRegisterPool.py b/src/Tensile/AsmRegisterPool.py
similarity index 100%
rename from Tensile/AsmRegisterPool.py
rename to src/Tensile/AsmRegisterPool.py
diff --git a/Tensile/AsmUtils.py b/src/Tensile/AsmUtils.py
similarity index 100%
rename from Tensile/AsmUtils.py
rename to src/Tensile/AsmUtils.py
diff --git a/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py
similarity index 100%
rename from Tensile/BenchmarkProblems.py
rename to src/Tensile/BenchmarkProblems.py
diff --git a/Tensile/BenchmarkSplitter.py b/src/Tensile/BenchmarkSplitter.py
similarity index 100%
rename from Tensile/BenchmarkSplitter.py
rename to src/Tensile/BenchmarkSplitter.py
diff --git a/Tensile/BenchmarkStructs.py b/src/Tensile/BenchmarkStructs.py
similarity index 100%
rename from Tensile/BenchmarkStructs.py
rename to src/Tensile/BenchmarkStructs.py
diff --git a/Tensile/ClientExecutable.py b/src/Tensile/ClientExecutable.py
similarity index 100%
rename from Tensile/ClientExecutable.py
rename to src/Tensile/ClientExecutable.py
diff --git a/Tensile/ClientWriter.py b/src/Tensile/ClientWriter.py
similarity index 100%
rename from Tensile/ClientWriter.py
rename to src/Tensile/ClientWriter.py
diff --git a/Tensile/Code.py b/src/Tensile/Code.py
similarity index 100%
rename from Tensile/Code.py
rename to src/Tensile/Code.py
diff --git a/Tensile/Common.py b/src/Tensile/Common.py
similarity index 100%
rename from Tensile/Common.py
rename to src/Tensile/Common.py
diff --git a/Tensile/Component.py b/src/Tensile/Component.py
similarity index 100%
rename from Tensile/Component.py
rename to src/Tensile/Component.py
diff --git a/Tensile/Components/ComputeStoreVgprs.py b/src/Tensile/Components/ComputeStoreVgprs.py
similarity index 100%
rename from Tensile/Components/ComputeStoreVgprs.py
rename to src/Tensile/Components/ComputeStoreVgprs.py
diff --git a/Tensile/Components/LocalRead.py b/src/Tensile/Components/LocalRead.py
similarity index 100%
rename from Tensile/Components/LocalRead.py
rename to src/Tensile/Components/LocalRead.py
diff --git a/Tensile/Components/LraTileAssignment.py b/src/Tensile/Components/LraTileAssignment.py
similarity index 100%
rename from Tensile/Components/LraTileAssignment.py
rename to src/Tensile/Components/LraTileAssignment.py
diff --git a/Tensile/Components/MAC_BF16_HPA.py b/src/Tensile/Components/MAC_BF16_HPA.py
similarity index 100%
rename from Tensile/Components/MAC_BF16_HPA.py
rename to src/Tensile/Components/MAC_BF16_HPA.py
diff --git a/Tensile/Components/MAC_F16.py b/src/Tensile/Components/MAC_F16.py
similarity index 100%
rename from Tensile/Components/MAC_F16.py
rename to src/Tensile/Components/MAC_F16.py
diff --git a/Tensile/Components/MAC_F16_HPA.py b/src/Tensile/Components/MAC_F16_HPA.py
similarity index 100%
rename from Tensile/Components/MAC_F16_HPA.py
rename to src/Tensile/Components/MAC_F16_HPA.py
diff --git a/Tensile/Components/MAC_F32.py b/src/Tensile/Components/MAC_F32.py
similarity index 100%
rename from Tensile/Components/MAC_F32.py
rename to src/Tensile/Components/MAC_F32.py
diff --git a/Tensile/Components/MAC_F32C.py b/src/Tensile/Components/MAC_F32C.py
similarity index 100%
rename from Tensile/Components/MAC_F32C.py
rename to src/Tensile/Components/MAC_F32C.py
diff --git a/Tensile/Components/MAC_F64.py b/src/Tensile/Components/MAC_F64.py
similarity index 100%
rename from Tensile/Components/MAC_F64.py
rename to src/Tensile/Components/MAC_F64.py
diff --git a/Tensile/Components/MAC_F64C.py b/src/Tensile/Components/MAC_F64C.py
similarity index 100%
rename from Tensile/Components/MAC_F64C.py
rename to src/Tensile/Components/MAC_F64C.py
diff --git a/Tensile/Components/MAC_I8X4.py b/src/Tensile/Components/MAC_I8X4.py
similarity index 100%
rename from Tensile/Components/MAC_I8X4.py
rename to src/Tensile/Components/MAC_I8X4.py
diff --git a/Tensile/Components/MAC_I8_HPA.py b/src/Tensile/Components/MAC_I8_HPA.py
similarity index 100%
rename from Tensile/Components/MAC_I8_HPA.py
rename to src/Tensile/Components/MAC_I8_HPA.py
diff --git a/Tensile/Components/MFMA.py b/src/Tensile/Components/MFMA.py
similarity index 100%
rename from Tensile/Components/MFMA.py
rename to src/Tensile/Components/MFMA.py
diff --git a/Tensile/Components/NotLocalFullTileElements.py b/src/Tensile/Components/NotLocalFullTileElements.py
similarity index 100%
rename from Tensile/Components/NotLocalFullTileElements.py
rename to src/Tensile/Components/NotLocalFullTileElements.py
diff --git a/Tensile/Components/Priority.py b/src/Tensile/Components/Priority.py
similarity index 100%
rename from Tensile/Components/Priority.py
rename to src/Tensile/Components/Priority.py
diff --git a/Tensile/Components/PseudoRandomGenerator.py b/src/Tensile/Components/PseudoRandomGenerator.py
similarity index 100%
rename from Tensile/Components/PseudoRandomGenerator.py
rename to src/Tensile/Components/PseudoRandomGenerator.py
diff --git a/Tensile/Components/ShiftVectorComponents.py b/src/Tensile/Components/ShiftVectorComponents.py
similarity index 100%
rename from Tensile/Components/ShiftVectorComponents.py
rename to src/Tensile/Components/ShiftVectorComponents.py
diff --git a/Tensile/Components/Signature.py b/src/Tensile/Components/Signature.py
similarity index 100%
rename from Tensile/Components/Signature.py
rename to src/Tensile/Components/Signature.py
diff --git a/Tensile/Components/__init__.py b/src/Tensile/Components/__init__.py
similarity index 100%
rename from Tensile/Components/__init__.py
rename to src/Tensile/Components/__init__.py
diff --git a/Tensile/Configuration.py b/src/Tensile/Configuration.py
similarity index 100%
rename from Tensile/Configuration.py
rename to src/Tensile/Configuration.py
diff --git a/Tensile/Contractions.py b/src/Tensile/Contractions.py
similarity index 100%
rename from Tensile/Contractions.py
rename to src/Tensile/Contractions.py
diff --git a/Tensile/CustomKernels.py b/src/Tensile/CustomKernels.py
similarity index 100%
rename from Tensile/CustomKernels.py
rename to src/Tensile/CustomKernels.py
diff --git a/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s b/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
similarity index 100%
rename from Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
rename to src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
diff --git a/Tensile/DataType.py b/src/Tensile/DataType.py
similarity index 100%
rename from Tensile/DataType.py
rename to src/Tensile/DataType.py
diff --git a/Tensile/EmbeddedData.py b/src/Tensile/EmbeddedData.py
similarity index 100%
rename from Tensile/EmbeddedData.py
rename to src/Tensile/EmbeddedData.py
diff --git a/Tensile/GenerateSummations.py b/src/Tensile/GenerateSummations.py
similarity index 100%
rename from Tensile/GenerateSummations.py
rename to src/Tensile/GenerateSummations.py
diff --git a/Tensile/Hardware.py b/src/Tensile/Hardware.py
similarity index 100%
rename from Tensile/Hardware.py
rename to src/Tensile/Hardware.py
diff --git a/Tensile/KernelWriter.py b/src/Tensile/KernelWriter.py
similarity index 100%
rename from Tensile/KernelWriter.py
rename to src/Tensile/KernelWriter.py
diff --git a/Tensile/KernelWriterAssembly.py b/src/Tensile/KernelWriterAssembly.py
similarity index 100%
rename from Tensile/KernelWriterAssembly.py
rename to src/Tensile/KernelWriterAssembly.py
diff --git a/Tensile/KernelWriterBase.py b/src/Tensile/KernelWriterBase.py
similarity index 100%
rename from Tensile/KernelWriterBase.py
rename to src/Tensile/KernelWriterBase.py
diff --git a/Tensile/KernelWriterBetaOnly.py b/src/Tensile/KernelWriterBetaOnly.py
similarity index 100%
rename from Tensile/KernelWriterBetaOnly.py
rename to src/Tensile/KernelWriterBetaOnly.py
diff --git a/Tensile/KernelWriterConversion.py b/src/Tensile/KernelWriterConversion.py
similarity index 100%
rename from Tensile/KernelWriterConversion.py
rename to src/Tensile/KernelWriterConversion.py
diff --git a/Tensile/KernelWriterSource.py b/src/Tensile/KernelWriterSource.py
similarity index 100%
rename from Tensile/KernelWriterSource.py
rename to src/Tensile/KernelWriterSource.py
diff --git a/Tensile/KernelWriterStreamKInit.py b/src/Tensile/KernelWriterStreamKInit.py
similarity index 100%
rename from Tensile/KernelWriterStreamKInit.py
rename to src/Tensile/KernelWriterStreamKInit.py
diff --git a/Tensile/LibraryIO.py b/src/Tensile/LibraryIO.py
similarity index 100%
rename from Tensile/LibraryIO.py
rename to src/Tensile/LibraryIO.py
diff --git a/Tensile/LibraryLogic.py b/src/Tensile/LibraryLogic.py
similarity index 100%
rename from Tensile/LibraryLogic.py
rename to src/Tensile/LibraryLogic.py
diff --git a/Tensile/Parallel.py b/src/Tensile/Parallel.py
similarity index 100%
rename from Tensile/Parallel.py
rename to src/Tensile/Parallel.py
diff --git a/Tensile/Properties.py b/src/Tensile/Properties.py
similarity index 100%
rename from Tensile/Properties.py
rename to src/Tensile/Properties.py
diff --git a/Tensile/ReplacementKernels.py b/src/Tensile/ReplacementKernels.py
similarity index 100%
rename from Tensile/ReplacementKernels.py
rename to src/Tensile/ReplacementKernels.py
diff --git a/Tensile/SolutionLibrary.py b/src/Tensile/SolutionLibrary.py
similarity index 100%
rename from Tensile/SolutionLibrary.py
rename to src/Tensile/SolutionLibrary.py
diff --git a/Tensile/SolutionSelectionLibrary.py b/src/Tensile/SolutionSelectionLibrary.py
similarity index 100%
rename from Tensile/SolutionSelectionLibrary.py
rename to src/Tensile/SolutionSelectionLibrary.py
diff --git a/Tensile/SolutionStructs.py b/src/Tensile/SolutionStructs.py
similarity index 100%
rename from Tensile/SolutionStructs.py
rename to src/Tensile/SolutionStructs.py
diff --git a/Tensile/SolutionWriter.py b/src/Tensile/SolutionWriter.py
similarity index 100%
rename from Tensile/SolutionWriter.py
rename to src/Tensile/SolutionWriter.py
diff --git a/Tensile/Tensile.py b/src/Tensile/Tensile.py
similarity index 100%
rename from Tensile/Tensile.py
rename to src/Tensile/Tensile.py
diff --git a/Tensile/TensileBenchmarkCluster.py b/src/Tensile/TensileBenchmarkCluster.py
similarity index 100%
rename from Tensile/TensileBenchmarkCluster.py
rename to src/Tensile/TensileBenchmarkCluster.py
diff --git a/Tensile/TensileBenchmarkClusterScripts.py b/src/Tensile/TensileBenchmarkClusterScripts.py
similarity index 100%
rename from Tensile/TensileBenchmarkClusterScripts.py
rename to src/Tensile/TensileBenchmarkClusterScripts.py
diff --git a/Tensile/TensileBenchmarkLibraryClient.py b/src/Tensile/TensileBenchmarkLibraryClient.py
similarity index 100%
rename from Tensile/TensileBenchmarkLibraryClient.py
rename to src/Tensile/TensileBenchmarkLibraryClient.py
diff --git a/Tensile/TensileClientConfig.py b/src/Tensile/TensileClientConfig.py
similarity index 100%
rename from Tensile/TensileClientConfig.py
rename to src/Tensile/TensileClientConfig.py
diff --git a/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py
similarity index 100%
rename from Tensile/TensileCreateLibrary.py
rename to src/Tensile/TensileCreateLibrary.py
diff --git a/Tensile/TensileLibLogicToYaml.py b/src/Tensile/TensileLibLogicToYaml.py
similarity index 100%
rename from Tensile/TensileLibLogicToYaml.py
rename to src/Tensile/TensileLibLogicToYaml.py
diff --git a/Tensile/TensileMergeLibrary.py b/src/Tensile/TensileMergeLibrary.py
similarity index 100%
rename from Tensile/TensileMergeLibrary.py
rename to src/Tensile/TensileMergeLibrary.py
diff --git a/Tensile/TensileRetuneLibrary.py b/src/Tensile/TensileRetuneLibrary.py
similarity index 100%
rename from Tensile/TensileRetuneLibrary.py
rename to src/Tensile/TensileRetuneLibrary.py
diff --git a/Tensile/TensileUpdateLibrary.py b/src/Tensile/TensileUpdateLibrary.py
similarity index 100%
rename from Tensile/TensileUpdateLibrary.py
rename to src/Tensile/TensileUpdateLibrary.py
diff --git a/Tensile/Utils.py b/src/Tensile/Utils.py
similarity index 100%
rename from Tensile/Utils.py
rename to src/Tensile/Utils.py
diff --git a/Tensile/__init__.py b/src/Tensile/__init__.py
similarity index 100%
rename from Tensile/__init__.py
rename to src/Tensile/__init__.py
diff --git a/Tensile/bin/Tensile b/src/Tensile/bin/Tensile
similarity index 100%
rename from Tensile/bin/Tensile
rename to src/Tensile/bin/Tensile
diff --git a/Tensile/bin/TensileBenchmarkCluster b/src/Tensile/bin/TensileBenchmarkCluster
similarity index 100%
rename from Tensile/bin/TensileBenchmarkCluster
rename to src/Tensile/bin/TensileBenchmarkCluster
diff --git a/Tensile/bin/TensileClientConfig b/src/Tensile/bin/TensileClientConfig
similarity index 100%
rename from Tensile/bin/TensileClientConfig
rename to src/Tensile/bin/TensileClientConfig
diff --git a/Tensile/bin/TensileCreateLibrary b/src/Tensile/bin/TensileCreateLibrary
similarity index 100%
rename from Tensile/bin/TensileCreateLibrary
rename to src/Tensile/bin/TensileCreateLibrary
diff --git a/Tensile/bin/TensileGenerateSummations b/src/Tensile/bin/TensileGenerateSummations
similarity index 100%
rename from Tensile/bin/TensileGenerateSummations
rename to src/Tensile/bin/TensileGenerateSummations
diff --git a/Tensile/bin/TensileLibLogicToYaml b/src/Tensile/bin/TensileLibLogicToYaml
similarity index 100%
rename from Tensile/bin/TensileLibLogicToYaml
rename to src/Tensile/bin/TensileLibLogicToYaml
diff --git a/Tensile/bin/TensileMergeLibrary b/src/Tensile/bin/TensileMergeLibrary
similarity index 100%
rename from Tensile/bin/TensileMergeLibrary
rename to src/Tensile/bin/TensileMergeLibrary
diff --git a/Tensile/bin/TensileRetuneLibrary b/src/Tensile/bin/TensileRetuneLibrary
similarity index 100%
rename from Tensile/bin/TensileRetuneLibrary
rename to src/Tensile/bin/TensileRetuneLibrary
diff --git a/Tensile/bin/TensileUpdateLibrary b/src/Tensile/bin/TensileUpdateLibrary
similarity index 100%
rename from Tensile/bin/TensileUpdateLibrary
rename to src/Tensile/bin/TensileUpdateLibrary
diff --git a/Tensile/Configs/alternate-format/sizeList-example.yaml b/src/Tensile/data/Configs/alternate-format/sizeList-example.yaml
similarity index 100%
rename from Tensile/Configs/alternate-format/sizeList-example.yaml
rename to src/Tensile/data/Configs/alternate-format/sizeList-example.yaml
diff --git a/Tensile/Configs/alternate-format/vega20-example.yaml b/src/Tensile/data/Configs/alternate-format/vega20-example.yaml
similarity index 100%
rename from Tensile/Configs/alternate-format/vega20-example.yaml
rename to src/Tensile/data/Configs/alternate-format/vega20-example.yaml
diff --git a/Tensile/Configs/deep_bench_nn.csv b/src/Tensile/data/Configs/deep_bench_nn.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_nn.csv
rename to src/Tensile/data/Configs/deep_bench_nn.csv
diff --git a/Tensile/Configs/deep_bench_nn_batched.csv b/src/Tensile/data/Configs/deep_bench_nn_batched.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_nn_batched.csv
rename to src/Tensile/data/Configs/deep_bench_nn_batched.csv
diff --git a/Tensile/Configs/deep_bench_nt.csv b/src/Tensile/data/Configs/deep_bench_nt.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_nt.csv
rename to src/Tensile/data/Configs/deep_bench_nt.csv
diff --git a/Tensile/Configs/deep_bench_nt_batched.csv b/src/Tensile/data/Configs/deep_bench_nt_batched.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_nt_batched.csv
rename to src/Tensile/data/Configs/deep_bench_nt_batched.csv
diff --git a/Tensile/Configs/deep_bench_tn.csv b/src/Tensile/data/Configs/deep_bench_tn.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_tn.csv
rename to src/Tensile/data/Configs/deep_bench_tn.csv
diff --git a/Tensile/Configs/deep_bench_tn_batched.csv b/src/Tensile/data/Configs/deep_bench_tn_batched.csv
similarity index 100%
rename from Tensile/Configs/deep_bench_tn_batched.csv
rename to src/Tensile/data/Configs/deep_bench_tn_batched.csv
diff --git a/Tensile/Configs/mfma/mfma_hpa_bf16_nt_test.yaml b/src/Tensile/data/Configs/mfma/mfma_hpa_bf16_nt_test.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_hpa_bf16_nt_test.yaml
rename to src/Tensile/data/Configs/mfma/mfma_hpa_bf16_nt_test.yaml
diff --git a/Tensile/Configs/mfma/mfma_igemm_lite_test.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_lite_test.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_igemm_lite_test.yaml
rename to src/Tensile/data/Configs/mfma/mfma_igemm_lite_test.yaml
diff --git a/Tensile/Configs/mfma/mfma_igemm_nn_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_igemm_nn_asm_full.yaml
rename to src/Tensile/data/Configs/mfma/mfma_igemm_nn_asm_full.yaml
diff --git a/Tensile/Configs/mfma/mfma_igemm_nt_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_igemm_nt_asm_full.yaml
rename to src/Tensile/data/Configs/mfma/mfma_igemm_nt_asm_full.yaml
diff --git a/Tensile/Configs/mfma/mfma_igemm_tn_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_igemm_tn_asm_full.yaml
rename to src/Tensile/data/Configs/mfma/mfma_igemm_tn_asm_full.yaml
diff --git a/Tensile/Configs/mfma/mfma_igemm_tt_asm_full.yaml b/src/Tensile/data/Configs/mfma/mfma_igemm_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_igemm_tt_asm_full.yaml
rename to src/Tensile/data/Configs/mfma/mfma_igemm_tt_asm_full.yaml
diff --git a/Tensile/Configs/mfma/mfma_test.yaml b/src/Tensile/data/Configs/mfma/mfma_test.yaml
similarity index 100%
rename from Tensile/Configs/mfma/mfma_test.yaml
rename to src/Tensile/data/Configs/mfma/mfma_test.yaml
diff --git a/Tensile/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml b/src/Tensile/data/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml
similarity index 100%
rename from Tensile/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml
rename to src/Tensile/data/Configs/mfma/rocblas_cgemm_asm_xdlops.yaml
diff --git a/Tensile/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/mfma/rocblas_sgemm_asm_single_kernel.yaml
diff --git a/Tensile/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml b/src/Tensile/data/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml
rename to src/Tensile/data/Configs/mfma/rocblas_sgemm_nt_hpl1_asm_full.yaml
diff --git a/Tensile/Configs/mfma/sgemm_tlunn.yaml b/src/Tensile/data/Configs/mfma/sgemm_tlunn.yaml
similarity index 100%
rename from Tensile/Configs/mfma/sgemm_tlunn.yaml
rename to src/Tensile/data/Configs/mfma/sgemm_tlunn.yaml
diff --git a/Tensile/Configs/mfma/sgemm_transposeLDS.yaml b/src/Tensile/data/Configs/mfma/sgemm_transposeLDS.yaml
similarity index 100%
rename from Tensile/Configs/mfma/sgemm_transposeLDS.yaml
rename to src/Tensile/data/Configs/mfma/sgemm_transposeLDS.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_conv/vega10_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/Logic/deepbench_gemm/vega10_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/Makefile b/src/Tensile/data/Configs/miopen/Makefile
similarity index 100%
rename from Tensile/Configs/miopen/Makefile
rename to src/Tensile/data/Configs/miopen/Makefile
diff --git a/Tensile/Configs/miopen/README.md b/src/Tensile/data/Configs/miopen/README.md
similarity index 100%
rename from Tensile/Configs/miopen/README.md
rename to src/Tensile/data/Configs/miopen/README.md
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_nt_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/configs/vega20_sgemm_tn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_nt_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/configs/vega20_sgemm_tn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-01-07/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_nt_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/configs/arcturus_sgemm_tn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-15/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nn_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_nt_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/configs/vega20_sgemm_tn_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-22/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_nt_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/configs/vega20_sgemm_tn_bert.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-04-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nn_bert_f16.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_nt_bert_f16.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/configs/vega20_hgemm_tn_bert_f16.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-07/exact/vega20_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/bert_sgemm_xdlops_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/dlrm_sgemm_xdlops_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/replacement-kernel-arcturus-tn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/configs/rocblas_sgemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-18/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nn_batched_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_nt_batched_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/configs/vega20_sgemm_tn_batched_msra.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-05-27/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nn_onnx.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_nt_onnx.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/configs/vega20_sgemm_tn_onnx.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-07-14/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nn_megatron.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_nt_megatron.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/configs/vega20_hgemm_tn_megatron.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-08-20/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/doit.sh
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/nt.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/configs/tn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-06/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-nt.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/bert-tn.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/configs/doit.sh
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/bert/2020-11-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nn_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_nt_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/configs/vega20_sgemm_tn_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nn_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_nt_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/configs/arcturus_sgemm_tn_dlrm.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-04-02/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/configs/temp.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-02/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nn_terabyte.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_nt_terabyte.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/configs/sgemm_xdlops_tn_terabyte.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-07-08/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nn_last-dlrm-terabyte-tt-2.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_nt_last-dlrm-terabyte-tt-2.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/configs/arcturus_sgemm_tn_last-dlrm-terabyte-tt-2.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/dlrm/2020-08-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/README b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/README
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/README
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/README
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/clients/samples/example_gemm_ext2-tn.cpp
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/doit.sh
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-nn-gfx900.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/spec2-tn-gfx900.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-nn-gfx900.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/configs/speccd-tn-gfx900.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/joined/vega10_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/nn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx900/raw/tn/vega10_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/doit.sh
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-nn-gfx906.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/spec2-tn-gfx906.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-nn-gfx906.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/configs/speccd-tn-gfx906.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/joined/vega20_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/nn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx906/raw/tn/vega20_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/doit.sh
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-nn-gfx908.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/spec2-tn-gfx908.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-nn-gfx908.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/configs/speccd-tn-gfx908.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/joined/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/nn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIIc.yaml
diff --git a/Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml b/src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
rename to src/Tensile/data/Configs/miopen/archives/ext2/2020-11-05/gfx908/raw/tn/arcturus_Cijk_Ailk_Bljk_SBIc.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_nt_batched.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/configs/sgemm_inception_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-03-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/inception/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/2_BenchmarkData.tar.gz
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nn_hbh.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_nt_hbh.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/configs/vega20_hgemm_tn_hbh.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/megatron/2021-02-04/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nn_mlp.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_nt_mlp.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/configs/vega20_sgemm_tn_mlp.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2019-11-20/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nn_k1.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_nt_k1.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/configs/vega20_sgemm_tn_k1.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlp/2020-03-30/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/archive/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-03/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/archive/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/mlperf/2019-05-06/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_nn_phantom.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs1/vega20_sgemm_tn_phantom.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nn_phantom.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_nt_phantom.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/configs/configs2/vega20_sgemm_tn_phantom.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/phantom/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nn_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_nt_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/configs/vega20_sgemm_tn_riga.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2019-08-26/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn-2x2.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt-2x2.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/configs/resnet-inception-nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bjlk_S.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijk_Ailk_Bljk_S.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bkml_SI.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-05/exact/vega20_Cijkl_Aijml_Bmkl_SI.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/configs/resnet-inception-hgemm-nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bjlk_HH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-05-06/exact/vega20_Cijk_Ailk_Bljk_HH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nn_resnext-inception.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/configs/arcturus_sgemm_nt_resnext-inception.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet/2020-06-15/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/README.md b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/README.md
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/README.md
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/README.md
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/hgemm_resnet50_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/config/sgemm_resnet50_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-09-12/logic/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/README.md b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/README.md
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/README.md
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/README.md
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hgemm_resnet50_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/hpa_resnet50_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_nt.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/config/sgemm_resnet50_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/main/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/merged/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2018-10-09/logic/resnet50/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nn_resnet50.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/configs/vega20_sgemm_nt_resnet50.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnet50/2019-12-03/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/2_BenchmarkData.tar.gz
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nn_sb.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_nt_sb.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/configs/arcturus_sgemm_tn_sb.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-10/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/2_BenchmarkData.tar.gz
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nn_resnext3d.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_nt_resnext3d.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/configs/vega20_sgemm_tn_resnext3d.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-17/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/2_BenchmarkData.tar.gz
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nn_resnext3d-r2.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_nt_resnext3d-r2.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/configs/vega20_sgemm_tn_resnext3d-r2.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/resnext3d/2021-02-18/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/configs/replacement-kernel-arcturus-tn.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-07-23/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/base/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/combined/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/configuration/sgemm_tn-guard-pr195.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc-raw/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/inc/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/logs/convert.log b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/convert.log
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/logs/convert.log
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/convert.log
diff --git a/Tensile/Configs/miopen/archives/rk/2020-08-12/logs/merge.log b/src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/merge.log
similarity index 100%
rename from Tensile/Configs/miopen/archives/rk/2020-08-12/logs/merge.log
rename to src/Tensile/data/Configs/miopen/archives/rk/2020-08-12/logs/merge.log
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-05-29/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nn_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_nt_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/configs/vega20_sgemm_tn_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-10/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nn_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_nt_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/configs/vega10_sgemm_tn_shakespeare.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2019-10-15/exact/vega10_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/configs/arcturus_sgemm_tn_miopen.yaml
diff --git a/Tensile/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/rnn/2020-03-27/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nn_skinny_small.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/arcturus_dgemm_nt_skinny_small.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nn_skinny_small.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/configs/vegoa20_dgemm_nt_skinny_small.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bjlk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bjlk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-21/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/arcturus_dgemm_nn_skinny_large.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/configs/vega20_dgemm_nn_skinny_large.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/arcturus_Cijk_Ailk_Bljk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml b/src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
rename to src/Tensile/data/Configs/miopen/archives/skinny-sizes/2020-05-27/exact/vega20_Cijk_Ailk_Bljk_DB.yaml
diff --git a/Tensile/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/small-sizes/archive/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/small-sizes/archive/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/small-sizes/exact/2019-11-11/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/small-sizes/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml
rename to src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_nn.yaml
diff --git a/Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml
rename to src/Tensile/data/Configs/miopen/archives/sparsNN/configs/sgemm_sparseNN_gemm_tn.yaml
diff --git a/Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/sparsNN/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_nt_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/configs/vega10_sgemm_tn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-01/exact/vega10_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_nt_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/configs/vega20_sgemm_tn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2019-11-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_nt_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/configs/arcturus_sgemm_tn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-01-19/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_nt_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/configs/arcturus_sgemm_tn_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-03-29/exact/arcturus_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nn_sgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_nt_sgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/configs/vega20_sgemm_tn_sgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-08-31/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nn_hgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_nt_hgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/configs/vega20_hgemm_tn_hgemm_transformer.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bjlk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Ailk_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml b/src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
rename to src/Tensile/data/Configs/miopen/archives/transformer/2020-09-01/exact/vega20_Cijk_Alik_Bljk_HBH.yaml
diff --git a/Tensile/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml
rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/configs/vega20_sgemm_nt_winograd.yaml
diff --git a/Tensile/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-08-26/exact/vega20_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml
rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/configs/vega20_sgemm_tn_winograd.yaml
diff --git a/Tensile/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Configs/miopen/archives/winograd/2019-10-05/exact/vega20_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Configs/miopen/boiler/header.yml b/src/Tensile/data/Configs/miopen/boiler/header.yml
similarity index 100%
rename from Tensile/Configs/miopen/boiler/header.yml
rename to src/Tensile/data/Configs/miopen/boiler/header.yml
diff --git a/Tensile/Configs/miopen/boiler/library_logic_hip_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_hip_only.yml
similarity index 100%
rename from Tensile/Configs/miopen/boiler/library_logic_hip_only.yml
rename to src/Tensile/data/Configs/miopen/boiler/library_logic_hip_only.yml
diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_vega10_only.yml
similarity index 100%
rename from Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml
rename to src/Tensile/data/Configs/miopen/boiler/library_logic_vega10_only.yml
diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml b/src/Tensile/data/Configs/miopen/boiler/library_logic_vega20_only.yml
similarity index 100%
rename from Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml
rename to src/Tensile/data/Configs/miopen/boiler/library_logic_vega20_only.yml
diff --git a/Tensile/Configs/miopen/convert_cfg.py b/src/Tensile/data/Configs/miopen/convert_cfg.py
similarity index 100%
rename from Tensile/Configs/miopen/convert_cfg.py
rename to src/Tensile/data/Configs/miopen/convert_cfg.py
diff --git a/Tensile/Configs/miopen/make_all.sh b/src/Tensile/data/Configs/miopen/make_all.sh
similarity index 100%
rename from Tensile/Configs/miopen/make_all.sh
rename to src/Tensile/data/Configs/miopen/make_all.sh
diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batch1.yml
diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_conv_1x1_batchN.yml
diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_large.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/deepbench_gemm_large.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_large.yml
diff --git a/Tensile/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/deepbench_gemm_skinny.yml
diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_all.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet50_all.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/resnet50_all.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/resnet50_all.yml
diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_batch64.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet50_batch64.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/resnet50_batch64.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/resnet50_batch64.yml
diff --git a/Tensile/Configs/miopen/problems/nn/resnet_batch64_B.yml b/src/Tensile/data/Configs/miopen/problems/nn/resnet_batch64_B.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nn/resnet_batch64_B.yml
rename to src/Tensile/data/Configs/miopen/problems/nn/resnet_batch64_B.yml
diff --git a/Tensile/Configs/miopen/problems/nt/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_large.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nt/deepbench_gemm_large.yml
rename to src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_large.yml
diff --git a/Tensile/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml
rename to src/Tensile/data/Configs/miopen/problems/nt/deepbench_gemm_skinny.yml
diff --git a/Tensile/Configs/miopen/problems/nt/resnet50_all.yml b/src/Tensile/data/Configs/miopen/problems/nt/resnet50_all.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/nt/resnet50_all.yml
rename to src/Tensile/data/Configs/miopen/problems/nt/resnet50_all.yml
diff --git a/Tensile/Configs/miopen/problems/tn/deepbench_gemm_large.yml b/src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_large.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/tn/deepbench_gemm_large.yml
rename to src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_large.yml
diff --git a/Tensile/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml b/src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml
rename to src/Tensile/data/Configs/miopen/problems/tn/deepbench_gemm_skinny.yml
diff --git a/Tensile/Configs/miopen/problems/tn/resnet50_all.yml b/src/Tensile/data/Configs/miopen/problems/tn/resnet50_all.yml
similarity index 100%
rename from Tensile/Configs/miopen/problems/tn/resnet50_all.yml
rename to src/Tensile/data/Configs/miopen/problems/tn/resnet50_all.yml
diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_3.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml
rename to src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_3.yml
diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_5.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml
rename to src/Tensile/data/Configs/miopen/solutions/hgemm_large_explore_5.yml
diff --git a/Tensile/Configs/miopen/solutions/hgemm_quick.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_quick.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/hgemm_quick.yml
rename to src/Tensile/data/Configs/miopen/solutions/hgemm_quick.yml
diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
rename to src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
rename to src/Tensile/data/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_3.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_3.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_5.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_5.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_7.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_large_explore_7.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_quick.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_quick.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_quick.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_quick.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_4.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_4.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_4.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_4.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml b/src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
similarity index 100%
rename from Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
rename to src/Tensile/data/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
diff --git a/Tensile/Configs/miopen/types/hgemm_nn.yml b/src/Tensile/data/Configs/miopen/types/hgemm_nn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/hgemm_nn.yml
rename to src/Tensile/data/Configs/miopen/types/hgemm_nn.yml
diff --git a/Tensile/Configs/miopen/types/hgemm_nt.yml b/src/Tensile/data/Configs/miopen/types/hgemm_nt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/hgemm_nt.yml
rename to src/Tensile/data/Configs/miopen/types/hgemm_nt.yml
diff --git a/Tensile/Configs/miopen/types/hgemm_tn.yml b/src/Tensile/data/Configs/miopen/types/hgemm_tn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/hgemm_tn.yml
rename to src/Tensile/data/Configs/miopen/types/hgemm_tn.yml
diff --git a/Tensile/Configs/miopen/types/hgemm_tt.yml b/src/Tensile/data/Configs/miopen/types/hgemm_tt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/hgemm_tt.yml
rename to src/Tensile/data/Configs/miopen/types/hgemm_tt.yml
diff --git a/Tensile/Configs/miopen/types/igemm_nn.yml b/src/Tensile/data/Configs/miopen/types/igemm_nn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/igemm_nn.yml
rename to src/Tensile/data/Configs/miopen/types/igemm_nn.yml
diff --git a/Tensile/Configs/miopen/types/igemm_nt.yml b/src/Tensile/data/Configs/miopen/types/igemm_nt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/igemm_nt.yml
rename to src/Tensile/data/Configs/miopen/types/igemm_nt.yml
diff --git a/Tensile/Configs/miopen/types/igemm_tn.yml b/src/Tensile/data/Configs/miopen/types/igemm_tn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/igemm_tn.yml
rename to src/Tensile/data/Configs/miopen/types/igemm_tn.yml
diff --git a/Tensile/Configs/miopen/types/igemm_tt.yml b/src/Tensile/data/Configs/miopen/types/igemm_tt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/igemm_tt.yml
rename to src/Tensile/data/Configs/miopen/types/igemm_tt.yml
diff --git a/Tensile/Configs/miopen/types/sgemm_nn.yml b/src/Tensile/data/Configs/miopen/types/sgemm_nn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/sgemm_nn.yml
rename to src/Tensile/data/Configs/miopen/types/sgemm_nn.yml
diff --git a/Tensile/Configs/miopen/types/sgemm_nt.yml b/src/Tensile/data/Configs/miopen/types/sgemm_nt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/sgemm_nt.yml
rename to src/Tensile/data/Configs/miopen/types/sgemm_nt.yml
diff --git a/Tensile/Configs/miopen/types/sgemm_tn.yml b/src/Tensile/data/Configs/miopen/types/sgemm_tn.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/sgemm_tn.yml
rename to src/Tensile/data/Configs/miopen/types/sgemm_tn.yml
diff --git a/Tensile/Configs/miopen/types/sgemm_tt.yml b/src/Tensile/data/Configs/miopen/types/sgemm_tt.yml
similarity index 100%
rename from Tensile/Configs/miopen/types/sgemm_tt.yml
rename to src/Tensile/data/Configs/miopen/types/sgemm_tt.yml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_gb_tt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hgemm_sb_tt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_gb_tt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_hpa_hgemm_sb_tt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_gb_tt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_nt_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tn_asm_full.yaml
diff --git a/Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml b/src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml
rename to src/Tensile/data/Configs/navi21/rocblas_sgemm_sb_tt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/src/Tensile/data/Configs/rocblas_cgemm.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_cgemm.yaml
rename to src/Tensile/data/Configs/rocblas_cgemm.yaml
diff --git a/Tensile/Configs/rocblas_cgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_cgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_cgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_cgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_cgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_cgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_cgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_cgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_asm_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_asm_square.yaml b/src/Tensile/data/Configs/rocblas_dgemm_asm_square.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_asm_square.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_asm_square.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_dgemm_bufferload_limit.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_bufferload_limit.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_bufferload_limit.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_dgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nn_inc0_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc0_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc2_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_inc3_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_nt_resume_train_exp.yaml b/src/Tensile/data/Configs/rocblas_dgemm_nt_resume_train_exp.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_nt_resume_train_exp.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_nt_resume_train_exp.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_tn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_tn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_dgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_dgemm_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_dgemm_tt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_dgemm_tt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hgemm_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hgemm_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hgemm_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hgemm_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_hgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_hgemm_asm_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_hgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_hgemm_bufferload_limit.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hgemm_bufferload_limit.yaml
rename to src/Tensile/data/Configs/rocblas_hgemm_bufferload_limit.yaml
diff --git a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml b/src/Tensile/data/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bf16_gemm_tn_asm_test.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml b/src/Tensile/data/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bf16s_gemm_tn_asm_test.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_inc1_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_gemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_hip_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16_tn_inc2_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_inc1_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_gemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_hip_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_bfloat16s_tn_inc2_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_asm_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_inc1_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_inc1_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_inc1_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_asm_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_inc1_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml b/src/Tensile/data/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_hsgemm_tt_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_hpa_igemm_nn_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_nn_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_igemm_nn_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_igemm_nn_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_igemm_nt_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_nt_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_igemm_nt_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_igemm_nt_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_igemm_tn_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_tn_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_igemm_tn_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_igemm_tn_hip.yaml
diff --git a/Tensile/Configs/rocblas_hpa_igemm_tt_hip.yaml b/src/Tensile/data/Configs/rocblas_hpa_igemm_tt_hip.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hpa_igemm_tt_hip.yaml
rename to src/Tensile/data/Configs/rocblas_hpa_igemm_tt_hip.yaml
diff --git a/Tensile/Configs/rocblas_hsgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_hsgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_hsgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_hsgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_igemm_asm_full_nn.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_nn.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_igemm_asm_full_nn.yaml
rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_nn.yaml
diff --git a/Tensile/Configs/rocblas_igemm_asm_full_nt.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_nt.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_igemm_asm_full_nt.yaml
rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_nt.yaml
diff --git a/Tensile/Configs/rocblas_igemm_asm_full_tn.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_tn.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_igemm_asm_full_tn.yaml
rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_tn.yaml
diff --git a/Tensile/Configs/rocblas_igemm_asm_full_tt.yaml b/src/Tensile/data/Configs/rocblas_igemm_asm_full_tt.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_igemm_asm_full_tt.yaml
rename to src/Tensile/data/Configs/rocblas_igemm_asm_full_tt.yaml
diff --git a/Tensile/Configs/rocblas_igemm_hip_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_igemm_hip_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_igemm_hip_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_igemm_hip_single_kernel.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_asm_lite.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_asm_only.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_only.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_asm_only.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_asm_only.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_asm_single_kernel.yaml b/src/Tensile/data/Configs/rocblas_sgemm_asm_single_kernel.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_asm_single_kernel.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_asm_single_kernel.yaml
diff --git a/Tensile/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml b/src/Tensile/data/Configs/rocblas_sgemm_bufferload_limit.yaml
similarity index 100%
rename from Tensile/Tests/extended/bufferload_offset/rocblas_sgemm_bufferload_limit.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_bufferload_limit.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_example.yaml b/src/Tensile/data/Configs/rocblas_sgemm_example.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_example.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_example.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml b/src/Tensile/data/Configs/rocblas_sgemm_hip_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_hip_lite.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_hip_lite.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_nn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_nt_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc1_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc2_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml b/src/Tensile/data/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml
rename to src/Tensile/data/Configs/rocblas_sgemm_tn_inc3_asm_full.yaml
diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/src/Tensile/data/Configs/rocblas_zgemm.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_zgemm.yaml
rename to src/Tensile/data/Configs/rocblas_zgemm.yaml
diff --git a/Tensile/Configs/rocblas_zgemm_asm_lite.yaml b/src/Tensile/data/Configs/rocblas_zgemm_asm_lite.yaml
similarity index 100%
rename from Tensile/Configs/rocblas_zgemm_asm_lite.yaml
rename to src/Tensile/data/Configs/rocblas_zgemm_asm_lite.yaml
diff --git a/Tensile/Perf/BDAS/dgemm_kmeans.yaml b/src/Tensile/data/Perf/BDAS/dgemm_kmeans.yaml
similarity index 100%
rename from Tensile/Perf/BDAS/dgemm_kmeans.yaml
rename to src/Tensile/data/Perf/BDAS/dgemm_kmeans.yaml
diff --git a/Tensile/Perf/BDAS/dgemm_pca.yaml b/src/Tensile/data/Perf/BDAS/dgemm_pca.yaml
similarity index 100%
rename from Tensile/Perf/BDAS/dgemm_pca.yaml
rename to src/Tensile/data/Perf/BDAS/dgemm_pca.yaml
diff --git a/Tensile/Perf/BERT/sgemm_xdlops.yaml b/src/Tensile/data/Perf/BERT/sgemm_xdlops.yaml
similarity index 100%
rename from Tensile/Perf/BERT/sgemm_xdlops.yaml
rename to src/Tensile/data/Perf/BERT/sgemm_xdlops.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nn.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops_nn.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nn_terabyte.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nt.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops_nt.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_nt_terabyte.yaml
diff --git a/Tensile/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml b/src/Tensile/data/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml
similarity index 100%
rename from Tensile/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml
rename to src/Tensile/data/Perf/DLRM/sgemm_xdlops_tn_terabyte.yaml
diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops.yaml
similarity index 100%
rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops.yaml
rename to src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops.yaml
diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml
similarity index 100%
rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml
rename to src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nn.yaml
diff --git a/Tensile/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml b/src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml
similarity index 100%
rename from Tensile/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml
rename to src/Tensile/data/Perf/TRANSFORMER/sgemm_xdlops_nt.yaml
diff --git a/Tensile/Perf/conv/README b/src/Tensile/data/Perf/conv/README
similarity index 100%
rename from Tensile/Perf/conv/README
rename to src/Tensile/data/Perf/conv/README
diff --git a/Tensile/Perf/conv/conv_1x1_af0em.yaml b/src/Tensile/data/Perf/conv/conv_1x1_af0em.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_1x1_af0em.yaml
rename to src/Tensile/data/Perf/conv/conv_1x1_af0em.yaml
diff --git a/Tensile/Perf/conv/conv_1x1_oddpbd.yaml b/src/Tensile/data/Perf/conv/conv_1x1_oddpbd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_1x1_oddpbd.yaml
rename to src/Tensile/data/Perf/conv/conv_1x1_oddpbd.yaml
diff --git a/Tensile/Perf/conv/conv_1x1u2_bdww.yaml b/src/Tensile/data/Perf/conv/conv_1x1u2_bdww.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_1x1u2_bdww.yaml
rename to src/Tensile/data/Perf/conv/conv_1x1u2_bdww.yaml
diff --git a/Tensile/Perf/conv/conv_1x1u2_fwd.yaml b/src/Tensile/data/Perf/conv/conv_1x1u2_fwd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_1x1u2_fwd.yaml
rename to src/Tensile/data/Perf/conv/conv_1x1u2_fwd.yaml
diff --git a/Tensile/Perf/conv/conv_1x7_fwd.yaml b/src/Tensile/data/Perf/conv/conv_1x7_fwd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_1x7_fwd.yaml
rename to src/Tensile/data/Perf/conv/conv_1x7_fwd.yaml
diff --git a/Tensile/Perf/conv/conv_7x1_fwd.yaml b/src/Tensile/data/Perf/conv/conv_7x1_fwd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_7x1_fwd.yaml
rename to src/Tensile/data/Perf/conv/conv_7x1_fwd.yaml
diff --git a/Tensile/Perf/conv/conv_7x1_fwd2.yaml b/src/Tensile/data/Perf/conv/conv_7x1_fwd2.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_7x1_fwd2.yaml
rename to src/Tensile/data/Perf/conv/conv_7x1_fwd2.yaml
diff --git a/Tensile/Perf/conv/conv_7x1_roundup.yaml b/src/Tensile/data/Perf/conv/conv_7x1_roundup.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_7x1_roundup.yaml
rename to src/Tensile/data/Perf/conv/conv_7x1_roundup.yaml
diff --git a/Tensile/Perf/conv/conv_7x7u2_fwd.yaml b/src/Tensile/data/Perf/conv/conv_7x7u2_fwd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_7x7u2_fwd.yaml
rename to src/Tensile/data/Perf/conv/conv_7x7u2_fwd.yaml
diff --git a/Tensile/Perf/conv/conv_bwdd_pbd.yaml b/src/Tensile/data/Perf/conv/conv_bwdd_pbd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_bwdd_pbd.yaml
rename to src/Tensile/data/Perf/conv/conv_bwdd_pbd.yaml
diff --git a/Tensile/Perf/conv/conv_fwd.yaml b/src/Tensile/data/Perf/conv/conv_fwd.yaml
similarity index 100%
rename from Tensile/Perf/conv/conv_fwd.yaml
rename to src/Tensile/data/Perf/conv/conv_fwd.yaml
diff --git a/Tensile/Perf/conv_bwdd_ex0.yaml b/src/Tensile/data/Perf/conv_bwdd_ex0.yaml
similarity index 100%
rename from Tensile/Perf/conv_bwdd_ex0.yaml
rename to src/Tensile/data/Perf/conv_bwdd_ex0.yaml
diff --git a/Tensile/Perf/conv_bwdd_ex1.yaml b/src/Tensile/data/Perf/conv_bwdd_ex1.yaml
similarity index 100%
rename from Tensile/Perf/conv_bwdd_ex1.yaml
rename to src/Tensile/data/Perf/conv_bwdd_ex1.yaml
diff --git a/Tensile/Perf/conv_bwdw_big_gsu.yaml b/src/Tensile/data/Perf/conv_bwdw_big_gsu.yaml
similarity index 100%
rename from Tensile/Perf/conv_bwdw_big_gsu.yaml
rename to src/Tensile/data/Perf/conv_bwdw_big_gsu.yaml
diff --git a/Tensile/Perf/conv_bwdw_small_gsu.yaml b/src/Tensile/data/Perf/conv_bwdw_small_gsu.yaml
similarity index 100%
rename from Tensile/Perf/conv_bwdw_small_gsu.yaml
rename to src/Tensile/data/Perf/conv_bwdw_small_gsu.yaml
diff --git a/Tensile/Perf/conv_fwd_ex0.yaml b/src/Tensile/data/Perf/conv_fwd_ex0.yaml
similarity index 100%
rename from Tensile/Perf/conv_fwd_ex0.yaml
rename to src/Tensile/data/Perf/conv_fwd_ex0.yaml
diff --git a/Tensile/Perf/dgemm_large_square.yaml b/src/Tensile/data/Perf/dgemm_large_square.yaml
similarity index 100%
rename from Tensile/Perf/dgemm_large_square.yaml
rename to src/Tensile/data/Perf/dgemm_large_square.yaml
diff --git a/Tensile/Perf/hpl.yaml b/src/Tensile/data/Perf/hpl.yaml
similarity index 100%
rename from Tensile/Perf/hpl.yaml
rename to src/Tensile/data/Perf/hpl.yaml
diff --git a/Tensile/Perf/hpl_one.yaml b/src/Tensile/data/Perf/hpl_one.yaml
similarity index 100%
rename from Tensile/Perf/hpl_one.yaml
rename to src/Tensile/data/Perf/hpl_one.yaml
diff --git a/Tensile/Perf/hpl_quick.yaml b/src/Tensile/data/Perf/hpl_quick.yaml
similarity index 100%
rename from Tensile/Perf/hpl_quick.yaml
rename to src/Tensile/data/Perf/hpl_quick.yaml
diff --git a/Tensile/Perf/hpl_quick44k.yaml b/src/Tensile/data/Perf/hpl_quick44k.yaml
similarity index 100%
rename from Tensile/Perf/hpl_quick44k.yaml
rename to src/Tensile/data/Perf/hpl_quick44k.yaml
diff --git a/Tensile/Perf/inception/conv_1x1u1.yaml b/src/Tensile/data/Perf/inception/conv_1x1u1.yaml
similarity index 100%
rename from Tensile/Perf/inception/conv_1x1u1.yaml
rename to src/Tensile/data/Perf/inception/conv_1x1u1.yaml
diff --git a/Tensile/Perf/inception/conv_1x1u1_starter.yaml b/src/Tensile/data/Perf/inception/conv_1x1u1_starter.yaml
similarity index 100%
rename from Tensile/Perf/inception/conv_1x1u1_starter.yaml
rename to src/Tensile/data/Perf/inception/conv_1x1u1_starter.yaml
diff --git a/Tensile/Perf/inception/conv_NxN.yaml b/src/Tensile/data/Perf/inception/conv_NxN.yaml
similarity index 100%
rename from Tensile/Perf/inception/conv_NxN.yaml
rename to src/Tensile/data/Perf/inception/conv_NxN.yaml
diff --git a/Tensile/Perf/sgemm_large_square_nn.yaml b/src/Tensile/data/Perf/sgemm_large_square_nn.yaml
similarity index 100%
rename from Tensile/Perf/sgemm_large_square_nn.yaml
rename to src/Tensile/data/Perf/sgemm_large_square_nn.yaml
diff --git a/Tensile/Perf/sgemm_large_square_nt.yaml b/src/Tensile/data/Perf/sgemm_large_square_nt.yaml
similarity index 100%
rename from Tensile/Perf/sgemm_large_square_nt.yaml
rename to src/Tensile/data/Perf/sgemm_large_square_nt.yaml
diff --git a/Tensile/Perf/sgemm_large_square_tn.yaml b/src/Tensile/data/Perf/sgemm_large_square_tn.yaml
similarity index 100%
rename from Tensile/Perf/sgemm_large_square_tn.yaml
rename to src/Tensile/data/Perf/sgemm_large_square_tn.yaml
diff --git a/Tensile/Perf/use_initial_strides_cd/README b/src/Tensile/data/Perf/use_initial_strides_cd/README
similarity index 100%
rename from Tensile/Perf/use_initial_strides_cd/README
rename to src/Tensile/data/Perf/use_initial_strides_cd/README
diff --git a/Tensile/Perf/use_initial_strides_cd/perf_baseline0.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_baseline0.yaml
similarity index 100%
rename from Tensile/Perf/use_initial_strides_cd/perf_baseline0.yaml
rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_baseline0.yaml
diff --git a/Tensile/Perf/use_initial_strides_cd/perf_uis_cd0.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd0.yaml
similarity index 100%
rename from Tensile/Perf/use_initial_strides_cd/perf_uis_cd0.yaml
rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd0.yaml
diff --git a/Tensile/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml b/src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml
similarity index 100%
rename from Tensile/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml
rename to src/Tensile/data/Perf/use_initial_strides_cd/perf_uis_cd_specialized.yaml
diff --git a/Tensile/Source/CMakeLists.txt b/src/Tensile/data/Source/CMakeLists.txt
similarity index 100%
rename from Tensile/Source/CMakeLists.txt
rename to src/Tensile/data/Source/CMakeLists.txt
diff --git a/Tensile/Source/EnableWarnings.cmake b/src/Tensile/data/Source/EnableWarnings.cmake
similarity index 100%
rename from Tensile/Source/EnableWarnings.cmake
rename to src/Tensile/data/Source/EnableWarnings.cmake
diff --git a/Tensile/Source/FindHIP.cmake b/src/Tensile/data/Source/FindHIP.cmake
similarity index 100%
rename from Tensile/Source/FindHIP.cmake
rename to src/Tensile/data/Source/FindHIP.cmake
diff --git a/Tensile/Source/FindOpenCL.cmake b/src/Tensile/data/Source/FindOpenCL.cmake
similarity index 100%
rename from Tensile/Source/FindOpenCL.cmake
rename to src/Tensile/data/Source/FindOpenCL.cmake
diff --git a/Tensile/Source/KernelHeader.h b/src/Tensile/data/Source/KernelHeader.h
similarity index 100%
rename from Tensile/Source/KernelHeader.h
rename to src/Tensile/data/Source/KernelHeader.h
diff --git a/Tensile/Source/TensileTypes.h b/src/Tensile/data/Source/TensileTypes.h
similarity index 100%
rename from Tensile/Source/TensileTypes.h
rename to src/Tensile/data/Source/TensileTypes.h
diff --git a/Tensile/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt
similarity index 100%
rename from Tensile/Source/client/CMakeLists.txt
rename to src/Tensile/data/Source/client/CMakeLists.txt
diff --git a/Tensile/Source/client/include/BenchmarkTimer.hpp b/src/Tensile/data/Source/client/include/BenchmarkTimer.hpp
similarity index 100%
rename from Tensile/Source/client/include/BenchmarkTimer.hpp
rename to src/Tensile/data/Source/client/include/BenchmarkTimer.hpp
diff --git a/Tensile/Source/client/include/CSVStackFile.hpp b/src/Tensile/data/Source/client/include/CSVStackFile.hpp
similarity index 100%
rename from Tensile/Source/client/include/CSVStackFile.hpp
rename to src/Tensile/data/Source/client/include/CSVStackFile.hpp
diff --git a/Tensile/Source/client/include/ClientProblemFactory.hpp b/src/Tensile/data/Source/client/include/ClientProblemFactory.hpp
similarity index 100%
rename from Tensile/Source/client/include/ClientProblemFactory.hpp
rename to src/Tensile/data/Source/client/include/ClientProblemFactory.hpp
diff --git a/Tensile/Source/client/include/ConvolutionProblem.hpp b/src/Tensile/data/Source/client/include/ConvolutionProblem.hpp
similarity index 100%
rename from Tensile/Source/client/include/ConvolutionProblem.hpp
rename to src/Tensile/data/Source/client/include/ConvolutionProblem.hpp
diff --git a/Tensile/Source/client/include/DataInitialization.hpp b/src/Tensile/data/Source/client/include/DataInitialization.hpp
similarity index 100%
rename from Tensile/Source/client/include/DataInitialization.hpp
rename to src/Tensile/data/Source/client/include/DataInitialization.hpp
diff --git a/Tensile/Source/client/include/DataInitializationTyped.hpp b/src/Tensile/data/Source/client/include/DataInitializationTyped.hpp
similarity index 100%
rename from Tensile/Source/client/include/DataInitializationTyped.hpp
rename to src/Tensile/data/Source/client/include/DataInitializationTyped.hpp
diff --git a/Tensile/Source/client/include/HardwareMonitor.hpp b/src/Tensile/data/Source/client/include/HardwareMonitor.hpp
similarity index 100%
rename from Tensile/Source/client/include/HardwareMonitor.hpp
rename to src/Tensile/data/Source/client/include/HardwareMonitor.hpp
diff --git a/Tensile/Source/client/include/HardwareMonitorListener.hpp b/src/Tensile/data/Source/client/include/HardwareMonitorListener.hpp
similarity index 100%
rename from Tensile/Source/client/include/HardwareMonitorListener.hpp
rename to src/Tensile/data/Source/client/include/HardwareMonitorListener.hpp
diff --git a/Tensile/Source/client/include/HardwareMonitorType.hpp b/src/Tensile/data/Source/client/include/HardwareMonitorType.hpp
similarity index 100%
rename from Tensile/Source/client/include/HardwareMonitorType.hpp
rename to src/Tensile/data/Source/client/include/HardwareMonitorType.hpp
diff --git a/Tensile/Source/client/include/HardwareMonitorWindows.hpp b/src/Tensile/data/Source/client/include/HardwareMonitorWindows.hpp
similarity index 100%
rename from Tensile/Source/client/include/HardwareMonitorWindows.hpp
rename to src/Tensile/data/Source/client/include/HardwareMonitorWindows.hpp
diff --git a/Tensile/Source/client/include/HardwareMonitor_fwd.hpp b/src/Tensile/data/Source/client/include/HardwareMonitor_fwd.hpp
similarity index 100%
rename from Tensile/Source/client/include/HardwareMonitor_fwd.hpp
rename to src/Tensile/data/Source/client/include/HardwareMonitor_fwd.hpp
diff --git a/Tensile/Source/client/include/LibraryUpdateReporter.hpp b/src/Tensile/data/Source/client/include/LibraryUpdateReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/LibraryUpdateReporter.hpp
rename to src/Tensile/data/Source/client/include/LibraryUpdateReporter.hpp
diff --git a/Tensile/Source/client/include/LogReporter.hpp b/src/Tensile/data/Source/client/include/LogReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/LogReporter.hpp
rename to src/Tensile/data/Source/client/include/LogReporter.hpp
diff --git a/Tensile/Source/client/include/MetaResultReporter.hpp b/src/Tensile/data/Source/client/include/MetaResultReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/MetaResultReporter.hpp
rename to src/Tensile/data/Source/client/include/MetaResultReporter.hpp
diff --git a/Tensile/Source/client/include/MetaRunListener.hpp b/src/Tensile/data/Source/client/include/MetaRunListener.hpp
similarity index 100%
rename from Tensile/Source/client/include/MetaRunListener.hpp
rename to src/Tensile/data/Source/client/include/MetaRunListener.hpp
diff --git a/Tensile/Source/client/include/PerformanceReporter.hpp b/src/Tensile/data/Source/client/include/PerformanceReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/PerformanceReporter.hpp
rename to src/Tensile/data/Source/client/include/PerformanceReporter.hpp
diff --git a/Tensile/Source/client/include/ProgressListener.hpp b/src/Tensile/data/Source/client/include/ProgressListener.hpp
similarity index 100%
rename from Tensile/Source/client/include/ProgressListener.hpp
rename to src/Tensile/data/Source/client/include/ProgressListener.hpp
diff --git a/Tensile/Source/client/include/Reference.hpp b/src/Tensile/data/Source/client/include/Reference.hpp
similarity index 100%
rename from Tensile/Source/client/include/Reference.hpp
rename to src/Tensile/data/Source/client/include/Reference.hpp
diff --git a/Tensile/Source/client/include/ReferenceValidator.hpp b/src/Tensile/data/Source/client/include/ReferenceValidator.hpp
similarity index 100%
rename from Tensile/Source/client/include/ReferenceValidator.hpp
rename to src/Tensile/data/Source/client/include/ReferenceValidator.hpp
diff --git a/Tensile/Source/client/include/ResultComparison.hpp b/src/Tensile/data/Source/client/include/ResultComparison.hpp
similarity index 100%
rename from Tensile/Source/client/include/ResultComparison.hpp
rename to src/Tensile/data/Source/client/include/ResultComparison.hpp
diff --git a/Tensile/Source/client/include/ResultFileReporter.hpp b/src/Tensile/data/Source/client/include/ResultFileReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/ResultFileReporter.hpp
rename to src/Tensile/data/Source/client/include/ResultFileReporter.hpp
diff --git a/Tensile/Source/client/include/ResultReporter.hpp b/src/Tensile/data/Source/client/include/ResultReporter.hpp
similarity index 100%
rename from Tensile/Source/client/include/ResultReporter.hpp
rename to src/Tensile/data/Source/client/include/ResultReporter.hpp
diff --git a/Tensile/Source/client/include/ResultReporter_fwd.hpp b/src/Tensile/data/Source/client/include/ResultReporter_fwd.hpp
similarity index 100%
rename from Tensile/Source/client/include/ResultReporter_fwd.hpp
rename to src/Tensile/data/Source/client/include/ResultReporter_fwd.hpp
diff --git a/Tensile/Source/client/include/RunListener.hpp b/src/Tensile/data/Source/client/include/RunListener.hpp
similarity index 100%
rename from Tensile/Source/client/include/RunListener.hpp
rename to src/Tensile/data/Source/client/include/RunListener.hpp
diff --git a/Tensile/Source/client/include/SolutionIterator.hpp b/src/Tensile/data/Source/client/include/SolutionIterator.hpp
similarity index 100%
rename from Tensile/Source/client/include/SolutionIterator.hpp
rename to src/Tensile/data/Source/client/include/SolutionIterator.hpp
diff --git a/Tensile/Source/client/include/TimingEvents.hpp b/src/Tensile/data/Source/client/include/TimingEvents.hpp
similarity index 100%
rename from Tensile/Source/client/include/TimingEvents.hpp
rename to src/Tensile/data/Source/client/include/TimingEvents.hpp
diff --git a/Tensile/Source/client/main.cpp b/src/Tensile/data/Source/client/main.cpp
similarity index 100%
rename from Tensile/Source/client/main.cpp
rename to src/Tensile/data/Source/client/main.cpp
diff --git a/Tensile/Source/client/source/BenchmarkTimer.cpp b/src/Tensile/data/Source/client/source/BenchmarkTimer.cpp
similarity index 100%
rename from Tensile/Source/client/source/BenchmarkTimer.cpp
rename to src/Tensile/data/Source/client/source/BenchmarkTimer.cpp
diff --git a/Tensile/Source/client/source/CSVStackFile.cpp b/src/Tensile/data/Source/client/source/CSVStackFile.cpp
similarity index 100%
rename from Tensile/Source/client/source/CSVStackFile.cpp
rename to src/Tensile/data/Source/client/source/CSVStackFile.cpp
diff --git a/Tensile/Source/client/source/ClientProblemFactory.cpp b/src/Tensile/data/Source/client/source/ClientProblemFactory.cpp
similarity index 100%
rename from Tensile/Source/client/source/ClientProblemFactory.cpp
rename to src/Tensile/data/Source/client/source/ClientProblemFactory.cpp
diff --git a/Tensile/Source/client/source/ConvolutionProblem.cpp b/src/Tensile/data/Source/client/source/ConvolutionProblem.cpp
similarity index 100%
rename from Tensile/Source/client/source/ConvolutionProblem.cpp
rename to src/Tensile/data/Source/client/source/ConvolutionProblem.cpp
diff --git a/Tensile/Source/client/source/DataInitialization.cpp b/src/Tensile/data/Source/client/source/DataInitialization.cpp
similarity index 100%
rename from Tensile/Source/client/source/DataInitialization.cpp
rename to src/Tensile/data/Source/client/source/DataInitialization.cpp
diff --git a/Tensile/Source/client/source/HardwareMonitor.cpp b/src/Tensile/data/Source/client/source/HardwareMonitor.cpp
similarity index 100%
rename from Tensile/Source/client/source/HardwareMonitor.cpp
rename to src/Tensile/data/Source/client/source/HardwareMonitor.cpp
diff --git a/Tensile/Source/client/source/HardwareMonitorListener.cpp b/src/Tensile/data/Source/client/source/HardwareMonitorListener.cpp
similarity index 100%
rename from Tensile/Source/client/source/HardwareMonitorListener.cpp
rename to src/Tensile/data/Source/client/source/HardwareMonitorListener.cpp
diff --git a/Tensile/Source/client/source/LibraryUpdateReporter.cpp b/src/Tensile/data/Source/client/source/LibraryUpdateReporter.cpp
similarity index 100%
rename from Tensile/Source/client/source/LibraryUpdateReporter.cpp
rename to src/Tensile/data/Source/client/source/LibraryUpdateReporter.cpp
diff --git a/Tensile/Source/client/source/MetaRunListener.cpp b/src/Tensile/data/Source/client/source/MetaRunListener.cpp
similarity index 100%
rename from Tensile/Source/client/source/MetaRunListener.cpp
rename to src/Tensile/data/Source/client/source/MetaRunListener.cpp
diff --git a/Tensile/Source/client/source/PerformanceReporter.cpp b/src/Tensile/data/Source/client/source/PerformanceReporter.cpp
similarity index 100%
rename from Tensile/Source/client/source/PerformanceReporter.cpp
rename to src/Tensile/data/Source/client/source/PerformanceReporter.cpp
diff --git a/Tensile/Source/client/source/ProgressListener.cpp b/src/Tensile/data/Source/client/source/ProgressListener.cpp
similarity index 100%
rename from Tensile/Source/client/source/ProgressListener.cpp
rename to src/Tensile/data/Source/client/source/ProgressListener.cpp
diff --git a/Tensile/Source/client/source/Reference.cpp b/src/Tensile/data/Source/client/source/Reference.cpp
similarity index 100%
rename from Tensile/Source/client/source/Reference.cpp
rename to src/Tensile/data/Source/client/source/Reference.cpp
diff --git a/Tensile/Source/client/source/ReferenceValidator.cpp b/src/Tensile/data/Source/client/source/ReferenceValidator.cpp
similarity index 100%
rename from Tensile/Source/client/source/ReferenceValidator.cpp
rename to src/Tensile/data/Source/client/source/ReferenceValidator.cpp
diff --git a/Tensile/Source/client/source/ResultFileReporter.cpp b/src/Tensile/data/Source/client/source/ResultFileReporter.cpp
similarity index 100%
rename from Tensile/Source/client/source/ResultFileReporter.cpp
rename to src/Tensile/data/Source/client/source/ResultFileReporter.cpp
diff --git a/Tensile/Source/client/source/ResultReporter.cpp b/src/Tensile/data/Source/client/source/ResultReporter.cpp
similarity index 100%
rename from Tensile/Source/client/source/ResultReporter.cpp
rename to src/Tensile/data/Source/client/source/ResultReporter.cpp
diff --git a/Tensile/Source/client/source/SolutionIterator.cpp b/src/Tensile/data/Source/client/source/SolutionIterator.cpp
similarity index 100%
rename from Tensile/Source/client/source/SolutionIterator.cpp
rename to src/Tensile/data/Source/client/source/SolutionIterator.cpp
diff --git a/Tensile/Source/client/source/TimingEvents.cpp b/src/Tensile/data/Source/client/source/TimingEvents.cpp
similarity index 100%
rename from Tensile/Source/client/source/TimingEvents.cpp
rename to src/Tensile/data/Source/client/source/TimingEvents.cpp
diff --git a/Tensile/Source/cmake/FindROCmSMI.cmake b/src/Tensile/data/Source/cmake/FindROCmSMI.cmake
similarity index 100%
rename from Tensile/Source/cmake/FindROCmSMI.cmake
rename to src/Tensile/data/Source/cmake/FindROCmSMI.cmake
diff --git a/Tensile/Source/hip_f8_impl.h b/src/Tensile/data/Source/hip_f8_impl.h
similarity index 100%
rename from Tensile/Source/hip_f8_impl.h
rename to src/Tensile/data/Source/hip_f8_impl.h
diff --git a/Tensile/Source/lib/CMakeLists.txt b/src/Tensile/data/Source/lib/CMakeLists.txt
similarity index 100%
rename from Tensile/Source/lib/CMakeLists.txt
rename to src/Tensile/data/Source/lib/CMakeLists.txt
diff --git a/Tensile/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml b/src/Tensile/data/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml
similarity index 100%
rename from Tensile/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml
rename to src/Tensile/data/Source/lib/configs/SolutionLibraries/KernelsLiteNavi.yaml
diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml
rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bjlk_SB.yaml
diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml
rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Ailk_Bljk_SB.yaml
diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml
similarity index 100%
rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml
rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bjlk_SB.yaml
diff --git a/Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml b/src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml
similarity index 100%
rename from Tensile/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml
rename to src/Tensile/data/Source/lib/configs/lite_configs/navi10_Cijk_Alik_Bljk_SB.yaml
diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPU.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/AMDGPU.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPU.hpp
diff --git a/Tensile/Source/lib/include/Tensile/AMDGPUPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPUPredicates.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/AMDGPUPredicates.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPUPredicates.hpp
diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/AMDGPU_Detail.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/AMDGPU_Detail.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/AMDGPU_Detail.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ArithmeticUnitTypes.hpp
diff --git a/Tensile/Source/lib/include/Tensile/CachingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/CachingLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/CachingLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/CachingLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Comparison.hpp b/src/Tensile/data/Source/lib/include/Tensile/Comparison.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Comparison.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Comparison.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionProblem.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblemPredicates.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionProblemPredicates.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblemPredicates.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblemProperties.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblemProperties.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionProblemProperties.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblemProperties.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_Detail.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionProblem_Detail.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_Detail.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionProblem_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionProblem_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionProblem_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionSolution.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionSolution.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionSolution.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionSolution.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ContractionSolution_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ContractionSolution_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ContractionSolution_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ContractionSolution_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Contractions.hpp b/src/Tensile/data/Source/lib/include/Tensile/Contractions.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Contractions.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Contractions.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_BFloat16.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_BFloat16.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_BFloat16.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_BFloat16.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Float8_BFloat8.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Half.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Half.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_Half.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Half.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Int8.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_Int8.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_Int8x4.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8x4.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_Int8x4.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_Int8x4.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DataTypes_XFloat32.hpp b/src/Tensile/data/Source/lib/include/Tensile/DataTypes_XFloat32.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DataTypes_XFloat32.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DataTypes_XFloat32.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Debug.hpp b/src/Tensile/data/Source/lib/include/Tensile/Debug.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Debug.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Debug.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DecisionTree.hpp b/src/Tensile/data/Source/lib/include/Tensile/DecisionTree.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DecisionTree.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DecisionTree.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DecisionTreeLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/DecisionTreeLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DecisionTreeLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DecisionTreeLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Distance.hpp b/src/Tensile/data/Source/lib/include/Tensile/Distance.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Distance.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Distance.hpp
diff --git a/Tensile/Source/lib/include/Tensile/DistinctType.hpp b/src/Tensile/data/Source/lib/include/Tensile/DistinctType.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/DistinctType.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/DistinctType.hpp
diff --git a/Tensile/Source/lib/include/Tensile/EmbeddedData.hpp b/src/Tensile/data/Source/lib/include/Tensile/EmbeddedData.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/EmbeddedData.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/EmbeddedData.hpp
diff --git a/Tensile/Source/lib/include/Tensile/EmbeddedLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/EmbeddedLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/EmbeddedLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/EmbeddedLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ExactLogicLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/ExactLogicLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ExactLogicLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ExactLogicLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/GranularitySelectionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/KernelArguments.hpp b/src/Tensile/data/Source/lib/include/Tensile/KernelArguments.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/KernelArguments.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/KernelArguments.hpp
diff --git a/Tensile/Source/lib/include/Tensile/KernelLanguageTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/KernelLanguageTypes.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/KernelLanguageTypes.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/KernelLanguageTypes.hpp
diff --git a/Tensile/Source/lib/include/Tensile/MLFeatures.hpp b/src/Tensile/data/Source/lib/include/Tensile/MLFeatures.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/MLFeatures.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/MLFeatures.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Macros.hpp b/src/Tensile/data/Source/lib/include/Tensile/Macros.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Macros.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Macros.hpp
diff --git a/Tensile/Source/lib/include/Tensile/MapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MapLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/MapLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/MapLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/MasterSolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MasterSolutionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/MasterSolutionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/MasterSolutionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/MatchingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/MatchingLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/MatchingLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/MatchingLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/PerformanceMetricTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/PerformanceMetricTypes.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/PerformanceMetricTypes.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/PerformanceMetricTypes.hpp
diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/PlaceholderLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/PlaceholderLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Predicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Predicates.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Predicates.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Predicates.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ProblemKey.hpp b/src/Tensile/data/Source/lib/include/Tensile/ProblemKey.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ProblemKey.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ProblemKey.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Properties.hpp b/src/Tensile/data/Source/lib/include/Tensile/Properties.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Properties.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Properties.hpp
diff --git a/Tensile/Source/lib/include/Tensile/PropertyMatching.hpp b/src/Tensile/data/Source/lib/include/Tensile/PropertyMatching.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/PropertyMatching.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/PropertyMatching.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ScalarValueTypes.hpp b/src/Tensile/data/Source/lib/include/Tensile/ScalarValueTypes.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ScalarValueTypes.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ScalarValueTypes.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Base.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Base.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/Base.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Base.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Containers.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Containers.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/Containers.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Containers.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionPredicates.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ContractionSolution.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/DecisionTreeLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/ExactLogicLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/GranularitySelectionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/HasTraits.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/HasTraits.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/HasTraits.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/HasTraits.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MLFeatures.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MLFeatures.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/MLFeatures.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/MLFeatures.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MapLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/MapLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/MapLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/MatchingLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/PlaceholderLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Predicates.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/Predicates.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Predicates.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/Properties.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/Properties.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/Properties.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/Properties.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Serialization/SolutionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/SingleSolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SingleSolutionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/SingleSolutionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/SingleSolutionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Singleton.hpp b/src/Tensile/data/Source/lib/include/Tensile/Singleton.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Singleton.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Singleton.hpp
diff --git a/Tensile/Source/lib/include/Tensile/SolutionLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/SolutionLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/SolutionLibrary_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/SolutionMapLibrary.hpp b/src/Tensile/data/Source/lib/include/Tensile/SolutionMapLibrary.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/SolutionMapLibrary.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/SolutionMapLibrary.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Tensile.hpp b/src/Tensile/data/Source/lib/include/Tensile/Tensile.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Tensile.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Tensile.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Tensile_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/Tensile_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Tensile_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Tensile_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/TensorDescriptor.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor.hpp
diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_Detail.hpp
diff --git a/Tensile/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/TensorDescriptor_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/TensorOps.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorOps.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/TensorOps.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/TensorOps.hpp
diff --git a/Tensile/Source/lib/include/Tensile/TensorOps_fwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/TensorOps_fwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/TensorOps_fwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/TensorOps_fwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/UserDrivenTuningParser.hpp b/src/Tensile/data/Source/lib/include/Tensile/UserDrivenTuningParser.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/UserDrivenTuningParser.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/UserDrivenTuningParser.hpp
diff --git a/Tensile/Source/lib/include/Tensile/Utils.hpp b/src/Tensile/data/Source/lib/include/Tensile/Utils.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/Utils.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/Utils.hpp
diff --git a/Tensile/Source/lib/include/Tensile/geom.hpp b/src/Tensile/data/Source/lib/include/Tensile/geom.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/geom.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/geom.hpp
diff --git a/Tensile/Source/lib/include/Tensile/hip/HipHardware.hpp b/src/Tensile/data/Source/lib/include/Tensile/hip/HipHardware.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/hip/HipHardware.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipHardware.hpp
diff --git a/Tensile/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp b/src/Tensile/data/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipSolutionAdapter.hpp
diff --git a/Tensile/Source/lib/include/Tensile/hip/HipUtils.hpp b/src/Tensile/data/Source/lib/include/Tensile/hip/HipUtils.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/hip/HipUtils.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/hip/HipUtils.hpp
diff --git a/Tensile/Source/lib/include/Tensile/hip_f8_impl.h b/src/Tensile/data/Source/lib/include/Tensile/hip_f8_impl.h
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/hip_f8_impl.h
rename to src/Tensile/data/Source/lib/include/Tensile/hip_f8_impl.h
diff --git a/Tensile/Source/lib/include/Tensile/llvm/Loading.hpp b/src/Tensile/data/Source/lib/include/Tensile/llvm/Loading.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/llvm/Loading.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/llvm/Loading.hpp
diff --git a/Tensile/Source/lib/include/Tensile/llvm/YAML.hpp b/src/Tensile/data/Source/lib/include/Tensile/llvm/YAML.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/llvm/YAML.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/llvm/YAML.hpp
diff --git a/Tensile/Source/lib/include/Tensile/msgpack/Loading.hpp b/src/Tensile/data/Source/lib/include/Tensile/msgpack/Loading.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/msgpack/Loading.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/msgpack/Loading.hpp
diff --git a/Tensile/Source/lib/include/Tensile/msgpack/MessagePack.hpp b/src/Tensile/data/Source/lib/include/Tensile/msgpack/MessagePack.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/msgpack/MessagePack.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/msgpack/MessagePack.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclFwd.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclFwd.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ocl/OclFwd.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclFwd.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclHardware.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclHardware.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ocl/OclHardware.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclHardware.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclSolutionAdapter.hpp
diff --git a/Tensile/Source/lib/include/Tensile/ocl/OclUtils.hpp b/src/Tensile/data/Source/lib/include/Tensile/ocl/OclUtils.hpp
similarity index 100%
rename from Tensile/Source/lib/include/Tensile/ocl/OclUtils.hpp
rename to src/Tensile/data/Source/lib/include/Tensile/ocl/OclUtils.hpp
diff --git a/Tensile/Source/lib/source/AMDGPU.cpp b/src/Tensile/data/Source/lib/source/AMDGPU.cpp
similarity index 100%
rename from Tensile/Source/lib/source/AMDGPU.cpp
rename to src/Tensile/data/Source/lib/source/AMDGPU.cpp
diff --git a/Tensile/Source/lib/source/ArithmeticUnitTypes.cpp b/src/Tensile/data/Source/lib/source/ArithmeticUnitTypes.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ArithmeticUnitTypes.cpp
rename to src/Tensile/data/Source/lib/source/ArithmeticUnitTypes.cpp
diff --git a/Tensile/Source/lib/source/ContractionProblem.cpp b/src/Tensile/data/Source/lib/source/ContractionProblem.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ContractionProblem.cpp
rename to src/Tensile/data/Source/lib/source/ContractionProblem.cpp
diff --git a/Tensile/Source/lib/source/ContractionSolution.cpp b/src/Tensile/data/Source/lib/source/ContractionSolution.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ContractionSolution.cpp
rename to src/Tensile/data/Source/lib/source/ContractionSolution.cpp
diff --git a/Tensile/Source/lib/source/DataTypes.cpp b/src/Tensile/data/Source/lib/source/DataTypes.cpp
similarity index 100%
rename from Tensile/Source/lib/source/DataTypes.cpp
rename to src/Tensile/data/Source/lib/source/DataTypes.cpp
diff --git a/Tensile/Source/lib/source/Debug.cpp b/src/Tensile/data/Source/lib/source/Debug.cpp
similarity index 100%
rename from Tensile/Source/lib/source/Debug.cpp
rename to src/Tensile/data/Source/lib/source/Debug.cpp
diff --git a/Tensile/Source/lib/source/EmbeddedData.cpp b/src/Tensile/data/Source/lib/source/EmbeddedData.cpp
similarity index 100%
rename from Tensile/Source/lib/source/EmbeddedData.cpp
rename to src/Tensile/data/Source/lib/source/EmbeddedData.cpp
diff --git a/Tensile/Source/lib/source/EmbeddedLibrary.cpp b/src/Tensile/data/Source/lib/source/EmbeddedLibrary.cpp
similarity index 100%
rename from Tensile/Source/lib/source/EmbeddedLibrary.cpp
rename to src/Tensile/data/Source/lib/source/EmbeddedLibrary.cpp
diff --git a/Tensile/Source/lib/source/KernelArguments.cpp b/src/Tensile/data/Source/lib/source/KernelArguments.cpp
similarity index 100%
rename from Tensile/Source/lib/source/KernelArguments.cpp
rename to src/Tensile/data/Source/lib/source/KernelArguments.cpp
diff --git a/Tensile/Source/lib/source/KernelLanguageTypes.cpp b/src/Tensile/data/Source/lib/source/KernelLanguageTypes.cpp
similarity index 100%
rename from Tensile/Source/lib/source/KernelLanguageTypes.cpp
rename to src/Tensile/data/Source/lib/source/KernelLanguageTypes.cpp
diff --git a/Tensile/Source/lib/source/MLFeatures.cpp b/src/Tensile/data/Source/lib/source/MLFeatures.cpp
similarity index 100%
rename from Tensile/Source/lib/source/MLFeatures.cpp
rename to src/Tensile/data/Source/lib/source/MLFeatures.cpp
diff --git a/Tensile/Source/lib/source/PerformanceMetricTypes.cpp b/src/Tensile/data/Source/lib/source/PerformanceMetricTypes.cpp
similarity index 100%
rename from Tensile/Source/lib/source/PerformanceMetricTypes.cpp
rename to src/Tensile/data/Source/lib/source/PerformanceMetricTypes.cpp
diff --git a/Tensile/Source/lib/source/ScalarValueTypes.cpp b/src/Tensile/data/Source/lib/source/ScalarValueTypes.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ScalarValueTypes.cpp
rename to src/Tensile/data/Source/lib/source/ScalarValueTypes.cpp
diff --git a/Tensile/Source/lib/source/Tensile.cpp b/src/Tensile/data/Source/lib/source/Tensile.cpp
similarity index 100%
rename from Tensile/Source/lib/source/Tensile.cpp
rename to src/Tensile/data/Source/lib/source/Tensile.cpp
diff --git a/Tensile/Source/lib/source/TensorDescriptor.cpp b/src/Tensile/data/Source/lib/source/TensorDescriptor.cpp
similarity index 100%
rename from Tensile/Source/lib/source/TensorDescriptor.cpp
rename to src/Tensile/data/Source/lib/source/TensorDescriptor.cpp
diff --git a/Tensile/Source/lib/source/TensorOps.cpp b/src/Tensile/data/Source/lib/source/TensorOps.cpp
similarity index 100%
rename from Tensile/Source/lib/source/TensorOps.cpp
rename to src/Tensile/data/Source/lib/source/TensorOps.cpp
diff --git a/Tensile/Source/lib/source/UserDrivenTuningParser.cpp b/src/Tensile/data/Source/lib/source/UserDrivenTuningParser.cpp
similarity index 100%
rename from Tensile/Source/lib/source/UserDrivenTuningParser.cpp
rename to src/Tensile/data/Source/lib/source/UserDrivenTuningParser.cpp
diff --git a/Tensile/Source/lib/source/Utils.cpp b/src/Tensile/data/Source/lib/source/Utils.cpp
similarity index 100%
rename from Tensile/Source/lib/source/Utils.cpp
rename to src/Tensile/data/Source/lib/source/Utils.cpp
diff --git a/Tensile/Source/lib/source/hip/CMakeLists.txt b/src/Tensile/data/Source/lib/source/hip/CMakeLists.txt
similarity index 100%
rename from Tensile/Source/lib/source/hip/CMakeLists.txt
rename to src/Tensile/data/Source/lib/source/hip/CMakeLists.txt
diff --git a/Tensile/Source/lib/source/hip/HipHardware.cpp b/src/Tensile/data/Source/lib/source/hip/HipHardware.cpp
similarity index 100%
rename from Tensile/Source/lib/source/hip/HipHardware.cpp
rename to src/Tensile/data/Source/lib/source/hip/HipHardware.cpp
diff --git a/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp b/src/Tensile/data/Source/lib/source/hip/HipSolutionAdapter.cpp
similarity index 100%
rename from Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp
rename to src/Tensile/data/Source/lib/source/hip/HipSolutionAdapter.cpp
diff --git a/Tensile/Source/lib/source/llvm/Loading.cpp b/src/Tensile/data/Source/lib/source/llvm/Loading.cpp
similarity index 100%
rename from Tensile/Source/lib/source/llvm/Loading.cpp
rename to src/Tensile/data/Source/lib/source/llvm/Loading.cpp
diff --git a/Tensile/Source/lib/source/llvm/YAML.cpp b/src/Tensile/data/Source/lib/source/llvm/YAML.cpp
similarity index 100%
rename from Tensile/Source/lib/source/llvm/YAML.cpp
rename to src/Tensile/data/Source/lib/source/llvm/YAML.cpp
diff --git a/Tensile/Source/lib/source/msgpack/MessagePack.cpp b/src/Tensile/data/Source/lib/source/msgpack/MessagePack.cpp
similarity index 100%
rename from Tensile/Source/lib/source/msgpack/MessagePack.cpp
rename to src/Tensile/data/Source/lib/source/msgpack/MessagePack.cpp
diff --git a/Tensile/Source/lib/source/ocl/CMakeLists.txt b/src/Tensile/data/Source/lib/source/ocl/CMakeLists.txt
similarity index 100%
rename from Tensile/Source/lib/source/ocl/CMakeLists.txt
rename to src/Tensile/data/Source/lib/source/ocl/CMakeLists.txt
diff --git a/Tensile/Source/lib/source/ocl/OclHardware.cpp b/src/Tensile/data/Source/lib/source/ocl/OclHardware.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ocl/OclHardware.cpp
rename to src/Tensile/data/Source/lib/source/ocl/OclHardware.cpp
diff --git a/Tensile/Source/lib/source/ocl/OclSolutionAdapter.cpp b/src/Tensile/data/Source/lib/source/ocl/OclSolutionAdapter.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ocl/OclSolutionAdapter.cpp
rename to src/Tensile/data/Source/lib/source/ocl/OclSolutionAdapter.cpp
diff --git a/Tensile/Source/lib/source/ocl/OclUtils.cpp b/src/Tensile/data/Source/lib/source/ocl/OclUtils.cpp
similarity index 100%
rename from Tensile/Source/lib/source/ocl/OclUtils.cpp
rename to src/Tensile/data/Source/lib/source/ocl/OclUtils.cpp
diff --git a/Tensile/Source/multigpu.sh b/src/Tensile/data/Source/multigpu.sh
similarity index 100%
rename from Tensile/Source/multigpu.sh
rename to src/Tensile/data/Source/multigpu.sh
diff --git a/Tensile/Source/tensile_bfloat16.h b/src/Tensile/data/Source/tensile_bfloat16.h
similarity index 100%
rename from Tensile/Source/tensile_bfloat16.h
rename to src/Tensile/data/Source/tensile_bfloat16.h
diff --git a/Tensile/Source/tensile_float8_bfloat8.h b/src/Tensile/data/Source/tensile_float8_bfloat8.h
similarity index 100%
rename from Tensile/Source/tensile_float8_bfloat8.h
rename to src/Tensile/data/Source/tensile_float8_bfloat8.h
diff --git a/Tensile/Source/winners.awk b/src/Tensile/data/Source/winners.awk
similarity index 100%
rename from Tensile/Source/winners.awk
rename to src/Tensile/data/Source/winners.awk
diff --git a/Tensile/Utilities/archive/merge_rocblas_yaml_files.py b/src/Tensile/data/Utilities/archive/merge_rocblas_yaml_files.py
similarity index 100%
rename from Tensile/Utilities/archive/merge_rocblas_yaml_files.py
rename to src/Tensile/data/Utilities/archive/merge_rocblas_yaml_files.py
diff --git a/Tensile/Utilities/merge.py b/src/Tensile/data/Utilities/merge.py
similarity index 100%
rename from Tensile/Utilities/merge.py
rename to src/Tensile/data/Utilities/merge.py
diff --git a/Tensile/cmake/TensileConfig.cmake b/src/Tensile/data/cmake/TensileConfig.cmake
similarity index 100%
rename from Tensile/cmake/TensileConfig.cmake
rename to src/Tensile/data/cmake/TensileConfig.cmake
diff --git a/Tensile/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake
similarity index 100%
rename from Tensile/cmake/TensileConfigVersion.cmake
rename to src/Tensile/data/cmake/TensileConfigVersion.cmake

From d24a9531a93e20ec4dae6f2594a0a5f088e52884 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Tue, 12 Dec 2023 01:31:06 +0100
Subject: [PATCH 02/13] Let poetry create executables

---
 pyproject.toml                            | 39 ++++++++++++++++++++
 src/Tensile/GenerateSummations.py         |  3 +-
 src/Tensile/TensileCreateLibrary.py       |  2 +-
 src/Tensile/bin/Tensile                   | 39 --------------------
 src/Tensile/bin/TensileBenchmarkCluster   | 39 --------------------
 src/Tensile/bin/TensileClientConfig       | 39 --------------------
 src/Tensile/bin/TensileCreateLibrary      | 43 -----------------------
 src/Tensile/bin/TensileGenerateSummations | 41 ---------------------
 src/Tensile/bin/TensileLibLogicToYaml     | 39 --------------------
 src/Tensile/bin/TensileMergeLibrary       | 41 ---------------------
 src/Tensile/bin/TensileRetuneLibrary      | 39 --------------------
 src/Tensile/bin/TensileUpdateLibrary      | 41 ---------------------
 12 files changed, 42 insertions(+), 363 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100755 src/Tensile/bin/Tensile
 delete mode 100755 src/Tensile/bin/TensileBenchmarkCluster
 delete mode 100755 src/Tensile/bin/TensileClientConfig
 delete mode 100755 src/Tensile/bin/TensileCreateLibrary
 delete mode 100755 src/Tensile/bin/TensileGenerateSummations
 delete mode 100755 src/Tensile/bin/TensileLibLogicToYaml
 delete mode 100755 src/Tensile/bin/TensileMergeLibrary
 delete mode 100755 src/Tensile/bin/TensileRetuneLibrary
 delete mode 100755 src/Tensile/bin/TensileUpdateLibrary

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..9fa1466336
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,39 @@
+[build-system]
+requires = ["setuptools>=65.5.1", "wheel", "poetry_core>=1.5.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "Tensile"
+version = "4.40.0"
+description = "Tensile is a tool for creating benchmark-driven backend libraries for GEMMs"
+license = "MIT"
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering",
+]
+packages = [
+    { include = "Tensile", from = "src" },
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8"
+toml = ">=0.10"
+pyyaml = ">=6.0"
+msgpack = ">=1.0"
+joblib = ">=1.2"
+pandas = ">=1.5"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "7.3.2"
+
+
+[tool.poetry.scripts]
+tensile = "Tensile.Tensile:main"
+tensile_benchmark_cluster = "Tensile.TensileBenchmarkCluster:main"
+tensile_client_config = "Tensile.TensileClientConfig:main"
+tensile_create_library = "Tensile.TensileCreateLibrary:main"
+tensile_generate_summations = "Tensile.GenerateSummations:main"
+tensile_lib_logic_to_yaml = "Tensile.TensileLibLogicToYaml:main"
+tensile_merge_library = "Tensile.TensileMergeLibrary:main"
+tensile_retune_library = "Tensile.TensileRetuneLibrary:main"
+tensile_update_library = "Tensile.TensileUpdateLibrary:main"
\ No newline at end of file
diff --git a/src/Tensile/GenerateSummations.py b/src/Tensile/GenerateSummations.py
index bd255414f7..404300fb2d 100644
--- a/src/Tensile/GenerateSummations.py
+++ b/src/Tensile/GenerateSummations.py
@@ -29,6 +29,7 @@
 import yaml
 import subprocess
 import glob
+import sys
 
 from shutil import copyfile
 from copy import deepcopy
@@ -70,7 +71,7 @@ def createLibraryForBenchmark(logicPath, libraryPath, currentPath):
     except (subprocess.CalledProcessError, OSError) as e:
         printExit("ClientWriter Benchmark Process exited with error: {}".format(e))
 
-def GenerateSummations(userArgs):
+def main(userArgs = sys.argv[1:]):
 
     inputLogicPath = userArgs[0]
     outputPath = userArgs[1]
diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py
index dae1d0502e..e3bcdb4222 100644
--- a/src/Tensile/TensileCreateLibrary.py
+++ b/src/Tensile/TensileCreateLibrary.py
@@ -1023,7 +1023,7 @@ def WriteClientLibraryFromSolutions(solutionList, libraryWorkingPath, tensileSou
 ################################################################################
 # Tensile Create Library
 ################################################################################
-def TensileCreateLibrary():
+def main():
   print1("")
   print1(HR)
   print1("# Tensile Create Library")
diff --git a/src/Tensile/bin/Tensile b/src/Tensile/bin/Tensile
deleted file mode 100755
index 1c53682cdd..0000000000
--- a/src/Tensile/bin/Tensile
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile import Tensile
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import Tensile
-
-# script run from commandline
-if __name__ == "__main__":
-    Tensile.main()
diff --git a/src/Tensile/bin/TensileBenchmarkCluster b/src/Tensile/bin/TensileBenchmarkCluster
deleted file mode 100755
index e1ac2592ec..0000000000
--- a/src/Tensile/bin/TensileBenchmarkCluster
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile import TensileBenchmarkCluster
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileBenchmarkCluster
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileBenchmarkCluster.main()
diff --git a/src/Tensile/bin/TensileClientConfig b/src/Tensile/bin/TensileClientConfig
deleted file mode 100755
index 3c076ccb92..0000000000
--- a/src/Tensile/bin/TensileClientConfig
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile import TensileClientConfig
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileClientConfig
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileClientConfig.main()
diff --git a/src/Tensile/bin/TensileCreateLibrary b/src/Tensile/bin/TensileCreateLibrary
deleted file mode 100755
index e90be28536..0000000000
--- a/src/Tensile/bin/TensileCreateLibrary
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-# This script only gets called by CMake
-
-try:
-    from Tensile.TensileCreateLibrary import TensileCreateLibrary
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile.TensileCreateLibrary import TensileCreateLibrary
-
-################################################################################
-# Main
-################################################################################
-if __name__ == "__main__":
-    TensileCreateLibrary()
diff --git a/src/Tensile/bin/TensileGenerateSummations b/src/Tensile/bin/TensileGenerateSummations
deleted file mode 100755
index 3807c2b1dc..0000000000
--- a/src/Tensile/bin/TensileGenerateSummations
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile.GenerateSummations import GenerateSummations
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile.GenerateSummations import GenerateSummations
-
-################################################################################
-# Main
-################################################################################
-if __name__ == "__main__":
-    GenerateSummations(sys.argv[1:])
diff --git a/src/Tensile/bin/TensileLibLogicToYaml b/src/Tensile/bin/TensileLibLogicToYaml
deleted file mode 100755
index a16867cde0..0000000000
--- a/src/Tensile/bin/TensileLibLogicToYaml
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile import TensileLibLogicToYaml
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileLibLogicToYaml
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileLibLogicToYaml.main()
diff --git a/src/Tensile/bin/TensileMergeLibrary b/src/Tensile/bin/TensileMergeLibrary
deleted file mode 100755
index a2bb92fa31..0000000000
--- a/src/Tensile/bin/TensileMergeLibrary
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-# This script only gets called by CMake
-
-try:
-    from Tensile import TensileMergeLibrary
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileMergeLibrary
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileMergeLibrary.main()
diff --git a/src/Tensile/bin/TensileRetuneLibrary b/src/Tensile/bin/TensileRetuneLibrary
deleted file mode 100755
index 21a8fca46c..0000000000
--- a/src/Tensile/bin/TensileRetuneLibrary
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-try:
-    from Tensile import TensileRetuneLibrary
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileRetuneLibrary
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileRetuneLibrary.main()
diff --git a/src/Tensile/bin/TensileUpdateLibrary b/src/Tensile/bin/TensileUpdateLibrary
deleted file mode 100755
index 84a55de88a..0000000000
--- a/src/Tensile/bin/TensileUpdateLibrary
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-#
-# Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-################################################################################
-
-# This script only gets called by CMake
-
-try:
-    from Tensile import TensileUpdateLibrary
-except ImportError:
-    import os.path
-    import sys
-    parentdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", ".."))
-    sys.path.append(parentdir)
-
-    from Tensile import TensileUpdateLibrary
-
-# script run from commandline
-if __name__ == "__main__":
-    TensileUpdateLibrary.main()

From 5b3abbfd2e9335f301416d807d7853733f2058e5 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:13:20 +0100
Subject: [PATCH 03/13] fix typo } used instead of )

---
 src/Tensile/data/cmake/TensileConfigVersion.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake
index 8c97bc461d..138865de22 100644
--- a/src/Tensile/data/cmake/TensileConfigVersion.cmake
+++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake
@@ -36,7 +36,7 @@ else()
     set(PACKAGE_VERSION_EXACT FALSE)
 endif()
 
-if(PACKAGE_VERSION_EXACT} OR (PACKAGE_FIND_VERSION VERSION_GREATER PACKAGE_VERSION))
+if(PACKAGE_VERSION_EXACT OR (PACKAGE_FIND_VERSION VERSION_GREATER PACKAGE_VERSION))
     set(PACKAGE_VERSION_COMPATIBLE TRUE)
 else()
     set(PACKAGE_VERSION_COMPATIBLE FALSE)

From ece0fc03d5ed97efe97284aa5864cb706cbbad90 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:15:39 +0100
Subject: [PATCH 04/13] Version is filled dynamically

---
 ...ileConfigVersion.cmake => TensileConfigVersion.cmake.j2} | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename src/Tensile/data/cmake/{TensileConfigVersion.cmake => TensileConfigVersion.cmake.j2} (95%)

diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
similarity index 95%
rename from src/Tensile/data/cmake/TensileConfigVersion.cmake
rename to src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
index 138865de22..d2267a7a33 100644
--- a/src/Tensile/data/cmake/TensileConfigVersion.cmake
+++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
@@ -23,9 +23,9 @@
 ################################################################################
 
 # hardcoded tensile version; also in Tensile/__init__.py
-set(TENSILE_VERSION_MAJOR  4)
-set(TENSILE_VERSION_MINOR 40)
-set(TENSILE_VERSION_PATCH  0)
+set(TENSILE_VERSION_MAJOR  {TENSILE_VERSION_MAJOR})
+set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR})
+set(TENSILE_VERSION_PATCH  {TENSILE_VERSION_PATCH})
 
 # export version
 set(PACKAGE_VERSION "${TENSILE_VERSION_MAJOR}.${TENSILE_VERSION_MINOR}.${TENSILE_VERSION_PATCH}")

From 736244ee5b031345e26906fbee89921a03fc7c6a Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:24:02 +0100
Subject: [PATCH 05/13] use f'string

---
 src/Tensile/BenchmarkProblems.py | 35 +++++++++++++++-----------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py
index 2fb094d609..35f896429e 100644
--- a/src/Tensile/BenchmarkProblems.py
+++ b/src/Tensile/BenchmarkProblems.py
@@ -83,7 +83,7 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch):
     """Creates a list with a Solution object for each name in customKernel"""
     solutions = []
     for kernelName in customKernels:
-        print1("# Processing custom kernel {}".format(kernelName))
+        print1(f"# Processing custom kernel {kernelName}")
         solution = getCustomKernelSolutionObj(kernelName)
         if solution["ProblemType"] != problemType:
             # Raise error if this kernel was specifically requested and problem type doesn't match
@@ -93,8 +93,8 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch):
                 customSet = set([(k,tuple(v)) if type(v) is list else (k,v) \
                         for k,v in solution["ProblemType"].items()])
 
-                msg = "The problem type in the config file does not match " \
-                        "that of the custom kernel, {}.".format(kernelName) \
+                msg = f"The problem type in the config file does not match " \
+                        f"that of the custom kernel, {kernelName}." \
                         + "\nDiffering parameters:\n" \
                         + "\tConfig values:\n\t" \
                         + str(sorted(benchmarkSet - (customSet & benchmarkSet))) \
@@ -102,9 +102,9 @@ def generateCustomKernelSolutions(problemType, customKernels, failOnMismatch):
                         +  str(sorted(customSet - (customSet & benchmarkSet)))
                 printExit(msg)
             else:
-                print1("# Rejected {}: Problem Type doesn't match".format(kernelName))
+                print1(f"# Rejected {kernelName}: Problem Type doesn't match")
         else:
-            print1("# Added {} to solutions".format(kernelName))
+            print1(f"# Added {kernelName} to solutions")
             if solution["Valid"]:
                 solutions.append(solution)
             elif globalParameters["PrintSolutionRejectionReason"]:
@@ -214,14 +214,15 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
     benchmarkProcess = BenchmarkProcess(problemTypeConfig, problemSizeGroupConfig)
 
     enableTileSelection = benchmarkProcess.problemType["TileAwareSelection"]
-    groupName = "{}_{:02d}".format(str(benchmarkProcess.problemType), problemSizeGroupIdx)
+    problemType = str(benchmarkProcess.problemType)
+    groupName = f"{problemType}_{problemSizeGroupIdx:02d}"
     pushWorkingPath(groupName)
     ensurePath(os.path.join(globalParameters["WorkingPath"], "Data"))
 
     totalBenchmarkSteps = len(benchmarkProcess)
     resultsFileBaseFinal = None
 
-    print1("# NumBenchmarkSteps: {}".format(totalBenchmarkSteps))
+    print1(f"# NumBenchmarkSteps: {totalBenchmarkSteps}")
     print1("")
     print1(HR)
     print1("# Done Creating BenchmarkProcess Object")
@@ -236,11 +237,11 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
         print1(HR)
         currentTime = time.time()
         elapsedTime = currentTime - startTime
-        print1("# Benchmark Step: {} - {} {:.3f}s".format(groupName, stepName, elapsedTime))
-        print1("# Num Sizes: {}".format(benchmarkStep.problemSizes.totalProblemSizes))
+        print1(f"# Benchmark Step: {groupName} - {stepName} {elapsedTime:.3f}s")
+        print1(f"# Num Sizes: {benchmarkStep.problemSizes.totalProblemSizes}")
         print1("# Fork Parameters:")
         for k, v in sorted(benchmarkStep.forkParams.items()):
-            print1("#     {}: {}".format(k, v))
+            print1(f"#     {k}: {v}")
 
         pushWorkingPath(shortName)
         stepBaseDir = globalParameters["WorkingPath"]
@@ -284,8 +285,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
             maxPossibleSolutions += len(kcSolutions)
             solutions = regSolutions + kcSolutions
 
-            print1("# Actual Solutions: {} / {} after SolutionStructs\n" \
-                .format(len(solutions), maxPossibleSolutions))
+            print1(f"# Actual Solutions: {len(solutions)} / {maxPossibleSolutions} after SolutionStructs\n")
 
             # handle no valid solutions
             if len(solutions) == 0:
@@ -300,7 +300,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
 
             if globalParameters["PrintLevel"] >= 1:
                 for solution in solutions:
-                    print2("#    ({}:{}) {}".format(0, 0, Solution.getNameFull(solution)))
+                    print2(f"#    (0:0) {Solution.getNameFull(solution)}")
                 print2(HR)
 
             # write benchmarkFiles
@@ -320,8 +320,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
             }
             LibraryIO.writeYAML(cachePath, cacheData)
 
-            print1("# Actual Solutions: {} / {} after KernelWriter\n" \
-                    .format(len(solutions), prevCount ))
+            print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n")
         else:
             solutions = None
             print1("# Using cached solution data")
@@ -348,8 +347,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
 
             if returncode:
                 benchmarkTestFails += 1
-                printWarning("BenchmarkProblems: Benchmark Process exited with code {}" \
-                        .format(returncode))
+                printWarning(f"BenchmarkProblems: Benchmark Process exited with code {returncode}")
         else:
             print1("# Already benchmarked; skipping.")
 
@@ -357,8 +355,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
         popWorkingPath()  # stepName
         currentTime = time.time()
         elapsedTime = currentTime - startTime
-        print1("{}\n# {}\n# {}: End - {:.3f}s\n{}\n" \
-                .format(HR, groupName, shortName, elapsedTime, HR))
+        print1(f"{HR}\n# {groupName}\n# {shortName}: End - {elapsedTime:.3f}s\n{HR}\n")
 
     popWorkingPath()  # ProblemType
     return (resultsFileBaseFinal, benchmarkTestFails)

From 4d74bedd37dcafd85b2641797ce8f9cd53f480f3 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:31:59 +0100
Subject: [PATCH 06/13] replace multiple call len(solutions) to
 num_of_solutions var

---
 src/Tensile/BenchmarkProblems.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Tensile/BenchmarkProblems.py b/src/Tensile/BenchmarkProblems.py
index 35f896429e..04f6163c5c 100644
--- a/src/Tensile/BenchmarkProblems.py
+++ b/src/Tensile/BenchmarkProblems.py
@@ -284,11 +284,12 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
 
             maxPossibleSolutions += len(kcSolutions)
             solutions = regSolutions + kcSolutions
+            num_of_solutions = len(solutions)
 
-            print1(f"# Actual Solutions: {len(solutions)} / {maxPossibleSolutions} after SolutionStructs\n")
+            print1(f"# Actual Solutions: {num_of_solutions} / {maxPossibleSolutions} after SolutionStructs\n")
 
             # handle no valid solutions
-            if len(solutions) == 0:
+            if num_of_solutions == 0:
                 msg = "Your parameters resulted in 0 valid solutions."
                 if globalParameters["PrintSolutionRejectionReason"]:
                     msg += "\nExamine reject and backtrace messages above to see why" \
@@ -304,7 +305,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
                 print2(HR)
 
             # write benchmarkFiles
-            prevCount = len(solutions)
+            prevCount = num_of_solutions
             codeObjectFiles = writeBenchmarkFiles(stepBaseDir, solutions, \
                     benchmarkStep.problemSizes, shortName, [])
             # ^ this mutates solutions
@@ -320,7 +321,7 @@ def benchmarkProblemType(problemTypeConfig, problemSizeGroupConfig, problemSizeG
             }
             LibraryIO.writeYAML(cachePath, cacheData)
 
-            print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n")
+            print1(f"# Actual Solutions: {len(solutions)} / {prevCount} after KernelWriter\n")  # recount: writeBenchmarkFiles mutated solutions
         else:
             solutions = None
             print1("# Using cached solution data")

From 9101d9277dbd60732b32088cec6994fa6ac86a76 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:33:23 +0100
Subject: [PATCH 07/13] use f'string

---
 src/Tensile/ClientExecutable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Tensile/ClientExecutable.py b/src/Tensile/ClientExecutable.py
index cde870b73e..b8349284a0 100644
--- a/src/Tensile/ClientExecutable.py
+++ b/src/Tensile/ClientExecutable.py
@@ -43,7 +43,7 @@ def generate(self):
 
         args = ['cmake']
         args += ['-G', 'Ninja'] if (os.name == 'nt') else []
-        args += itertools.chain.from_iterable([ ['-D{}={}'.format(key, value)] for key,value in self.options.items()])
+        args += itertools.chain.from_iterable([ [f'-D{key}={value}'] for key,value in self.options.items()])
         args += [self.sourceDir]
         args = [cmake_path(arg) for arg in args]
 

From 66b35b15d0421c0a82d97c3d9e899631b41a1af4 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 13 Dec 2023 01:34:37 +0100
Subject: [PATCH 08/13] Error output goes to stderr

---
 src/Tensile/ClientExecutable.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Tensile/ClientExecutable.py b/src/Tensile/ClientExecutable.py
index b8349284a0..0e739b979f 100644
--- a/src/Tensile/ClientExecutable.py
+++ b/src/Tensile/ClientExecutable.py
@@ -25,6 +25,7 @@
 import itertools
 import os
 import subprocess
+from sys import stderr
 
 from . import Common
 from .Common import globalParameters, print2
@@ -54,7 +55,7 @@ def generate(self):
                 out = subprocess.check_output(args, stderr=subprocess.STDOUT, cwd=Common.ensurePath(self.buildDir))
                 print2(out)
             except subprocess.CalledProcessError as err:
-                print(err.output)
+                print(err.output, file=stderr)
                 raise
             
 
@@ -67,7 +68,7 @@ def build(self):
                 out = subprocess.check_output(args, stderr=subprocess.STDOUT, cwd=self.buildDir)
                 print2(out)
             except subprocess.CalledProcessError as err:
-                print(err.output)
+                print(err.output, file=stderr)
                 raise
 
     def builtPath(self, path, *paths):

From bc48adbdd737cadb25460c881b9538fd0862619e Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 10 Jan 2024 22:44:40 +0100
Subject: [PATCH 09/13] Fetch version from metadata to avoid hardcoded value

---
 src/Tensile/__init__.py                              | 4 ++--
 src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/Tensile/__init__.py b/src/Tensile/__init__.py
index bcbaed7d9c..f6b58f4853 100644
--- a/src/Tensile/__init__.py
+++ b/src/Tensile/__init__.py
@@ -24,9 +24,9 @@
 
 # Even though we don't support python 2, this is still packaged sometimes with python 2.
 from __future__ import print_function
+from importlib import metadata
 
-# hardcoded tensile version; also in Tensile/Source/TensileConfigVersion.cmake
-__version__ = "4.40.0"
+__version__ = metadata.version("Tensile")
 
 def PrintTensileRoot():
     import os.path
diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
index d2267a7a33..1c56f4a612 100644
--- a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
+++ b/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
@@ -22,7 +22,6 @@
 #
 ################################################################################
 
-# hardcoded tensile version; also in Tensile/__init__.py
 set(TENSILE_VERSION_MAJOR  {TENSILE_VERSION_MAJOR})
 set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR})
 set(TENSILE_VERSION_PATCH  {TENSILE_VERSION_PATCH})

From 62b10e4024ad4b287c63652bbe6956169265e803 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 10 Jan 2024 23:17:35 +0100
Subject: [PATCH 10/13] The variable CMAKE_SOURCE_DIR has the same value and
 should be preferred

---
 src/Tensile/data/Source/client/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Tensile/data/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt
index a9ac80a919..cae52f54c1 100644
--- a/src/Tensile/data/Source/client/CMakeLists.txt
+++ b/src/Tensile/data/Source/client/CMakeLists.txt
@@ -60,7 +60,7 @@ find_package(Boost COMPONENTS program_options REQUIRED)
 if (NOT WIN32)
     find_package(ROCmSMI QUIET)
     if(NOT ROCmSMI_FOUND)
-        set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_HOME_DIRECTORY}/cmake")
+        set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_SOURCE_DIR}/cmake")
         find_package(ROCmSMI REQUIRED)
     endif()
 endif()

From 2ba8577e5c25cefda8fdbf714c63a881e3048e62 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 10 Jan 2024 23:37:06 +0100
Subject: [PATCH 11/13] Use list comprehension for readability

---
 src/Tensile/TensileCreateLibrary.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py
index e3bcdb4222..cd20da9ee3 100644
--- a/src/Tensile/TensileCreateLibrary.py
+++ b/src/Tensile/TensileCreateLibrary.py
@@ -862,9 +862,7 @@ def writeCMake(outputPath, solutionFiles, kernelFiles, libraryStaticFiles, maste
     buildObjectFilePaths(cmakeSrcDir, solutionFiles, kernelFiles, [], [], [], masterLibraries)
 
   # Build full paths the static library files
-  staticFilePaths = []
-  for staticFile in libraryStaticFiles:
-    staticFilePaths += [ os.path.join(cmakeSrcDir, staticFile) ]
+  staticFilePaths = [os.path.join(cmakeSrcDir, staticFile) for staticFile in libraryStaticFiles]
 
   # Proceed to generate cmake file
   generatedFile = open(os.path.join(os.path.normcase(outputPath), "Generated.cmake"), "w")

From be0d3f0c9d28b2ddb7847192207b83e481fbe6b6 Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Wed, 10 Jan 2024 23:53:19 +0100
Subject: [PATCH 12/13] Static files are already copied a few lines above
 by the function copyStaticFiles

---
 src/Tensile/TensileCreateLibrary.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py
index cd20da9ee3..455435c85c 100644
--- a/src/Tensile/TensileCreateLibrary.py
+++ b/src/Tensile/TensileCreateLibrary.py
@@ -1216,10 +1216,7 @@ def splitExtraParameters(par):
   if not arguments["GenerateSourcesAndExit"]:
     writeCMake(outputPath, solutionFiles, sourceKernelFiles, staticFiles, masterLibraries)
 
-  # Make sure to copy the library static files.
-  for fileName in staticFiles:
-    shutil.copy( os.path.join(globalParameters["SourcePath"], fileName), \
-      outputPath )
+
 
   # write solutions and kernels
   codeObjectFiles = writeSolutionsAndKernels(outputPath, CxxCompiler, None, solutions,

From df083f6539108c34b5f4eeaf24afe07f4c12e89a Mon Sep 17 00:00:00 2001
From: Jonathan MERCIER <bioinfornatics@gmail.com>
Date: Thu, 18 Jan 2024 14:21:14 +0100
Subject: [PATCH 13/13] WIP refactoring by using importlib

---
 pyproject.toml                                |     5 +
 src/Tensile/ClientWriter.py                   |     2 +-
 src/Tensile/Common.py                         |    20 +-
 src/Tensile/CustomKernels.py                  |    26 +-
 ...128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s | 10827 ----------------
 src/Tensile/KernelWriter.py                   |     2 +-
 src/Tensile/TensileCreateLibrary.py           |     9 +-
 src/Tensile/data/Source/client/CMakeLists.txt |     2 +-
 .../cmake/TensileConfigVersion.cmake.j2       |     2 +-
 9 files changed, 49 insertions(+), 10846 deletions(-)
 delete mode 100644 src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
 rename src/Tensile/data/{ => template}/cmake/TensileConfigVersion.cmake.j2 (96%)

diff --git a/pyproject.toml b/pyproject.toml
index 9fa1466336..c1e4b64878 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ build-backend = "poetry.core.masonry.api"
 name = "Tensile"
 version = "4.40.0"
 description = "Tensile is a tool for creating benchmark-driven backend libraries for GEMMs"
+authors = ["Rocm team"]
 license = "MIT"
 classifiers = [
     "Intended Audience :: Science/Research",
@@ -15,6 +16,10 @@ packages = [
     { include = "Tensile", from = "src" },
 ]
 
+include = [
+    "src/Tensile/data/**/*",
+]
+
 [tool.poetry.dependencies]
 python = ">=3.8"
 toml = ">=0.10"
diff --git a/src/Tensile/ClientWriter.py b/src/Tensile/ClientWriter.py
index 49f8a24612..bea7c7fb02 100644
--- a/src/Tensile/ClientWriter.py
+++ b/src/Tensile/ClientWriter.py
@@ -176,7 +176,7 @@ def runClient(libraryLogicPath, forBenchmark, enableTileSelection, configPaths=N
 def getBuildClientLibraryScript(buildPath, libraryLogicPath):
   callCreateLibraryCmd = ["python"] if os.name == "nt" else []
 
-  callCreateLibraryCmd += [os.path.join(globalParameters["ScriptPath"] , "bin", "TensileCreateLibrary")]
+  callCreateLibraryCmd += ['tensile_create_library']
 
   if globalParameters["MergeFiles"]:
     callCreateLibraryCmd += ["--merge-files"]
diff --git a/src/Tensile/Common.py b/src/Tensile/Common.py
index caf361fcb4..3e0fb7937c 100644
--- a/src/Tensile/Common.py
+++ b/src/Tensile/Common.py
@@ -34,6 +34,9 @@
 import sys
 import time
 
+import shutil
+from importlib.resources import path, contents, files
+
 startTime = time.time()
 
 ParallelMap = Parallel.ParallelMap
@@ -225,7 +228,7 @@
 globalParameters["AssemblerPath"] = None                # /opt/rocm/hip/bin/hipcc
 globalParameters["WorkingPath"] = os.getcwd()           # path where tensile called from
 globalParameters["IndexChars"] =  "IJKLMNOPQRSTUVWXYZ"  # which characters to use for C[ij]=Sum[k] A[ik]*B[jk]
-globalParameters["ScriptPath"] = os.path.dirname(os.path.realpath(__file__))            # path to Tensile/Tensile.py
+globalParameters["ScriptPath"] = os.path.dirname(os.path.realpath(__file__))            # FIXME: legacy package dir, still read by SourcePath below; migrate to importlib.resources
 globalParameters["SourcePath"] = os.path.join(globalParameters["ScriptPath"], "Source") # path to Tensile/Source/
 globalParameters["HipClangVersion"] = "0.0.0"
 
@@ -256,7 +259,7 @@
 globalParameters["GranularityThreshold"] = 0.0
 
 # directory where custom kernels are located
-globalParameters["CustomKernelDirectory"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "CustomKernels")
+globalParameters["CustomKernelDirectory"] = files('Tensile.data').joinpath("CustomKernels")
 
 globalParameters["PristineOnGPU"] = True # use Pristine memory on Tensile training verification or not
 
@@ -2517,3 +2520,16 @@ def __del__(self):
 """
 
 HR = "################################################################################"
+
+def copy_data_files(data_to_copy: list, destination_path: str) -> None:
+    """Copy each top-level 'Tensile.data' resource whose name starts with an entry of data_to_copy into destination_path."""
+    os.makedirs(destination_path, exist_ok=True)
+
+    for resource in contents('Tensile.data'):
+      for data in data_to_copy:
+        if resource.startswith(data):
+            with path('Tensile.data', resource) as resource_path:
+                if os.path.isfile(resource_path):
+                    shutil.copy(resource_path, destination_path)
+                elif os.path.isdir(resource_path):
+                    shutil.copytree(resource_path, os.path.join(destination_path, resource))
diff --git a/src/Tensile/CustomKernels.py b/src/Tensile/CustomKernels.py
index f3254b55f4..9c6a3907c5 100644
--- a/src/Tensile/CustomKernels.py
+++ b/src/Tensile/CustomKernels.py
@@ -27,24 +27,34 @@
 import yaml
 
 import os
+from pathlib import Path
+from typing import Union, List
 
 def isCustomKernelConfig(config):
     return "CustomKernelName" in config and config["CustomKernelName"]
 
-def getCustomKernelFilepath(name, directory=globalParameters["CustomKernelDirectory"]):
-    return os.path.join(directory, (name + ".s"))
+def getCustomKernelFilepath(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]):
+    if not isinstance(directory, Path):
+        directory = Path(directory)
+    return directory.joinpath(name + ".s")  # pathlib.Path has no .join(); joinpath() appends the file name
 
-def getAllCustomKernelNames(directory=globalParameters["CustomKernelDirectory"]):
-    return [fname[:-2] for fname in os.listdir(directory) if fname.endswith(".s")]
+def getAllCustomKernelNames(directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]):
+    if not isinstance(directory, Path):
+        directory = Path(directory)
+    return [entry.name[:-2] for entry in directory.iterdir() if entry.name.endswith(".s")]
 
-def getCustomKernelContents(name, directory=globalParameters["CustomKernelDirectory"]):
+def getCustomKernelContents(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]):
+    if not isinstance(directory, Path):
+        directory = Path(directory)
     try:
         with open(getCustomKernelFilepath(name, directory)) as f:
             return f.read()
     except:
         raise RuntimeError("Failed to find custom kernel: {}".format(os.path.join(directory, name)))
 
-def getCustomKernelConfigAndAssembly(name, directory=globalParameters["CustomKernelDirectory"]):
+def getCustomKernelConfigAndAssembly(name, directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]):
+    if not isinstance(directory, Path):
+        directory = Path(directory)
     contents  = getCustomKernelContents(name, directory)
     config = "\n"    #Yaml configuration properties
     assembly = ""
@@ -57,7 +67,9 @@ def getCustomKernelConfigAndAssembly(name, directory=globalParameters["CustomKer
 
     return (config, assembly)
 
-def getCustomKernelConfig(name, directory=globalParameters["CustomKernelDirectory"]):
+def getCustomKernelConfig(name,directory: Union[str, Path]=globalParameters["CustomKernelDirectory"]):
+    if not isinstance(directory, Path):
+        directory = Path(directory)
     rawConfig, _ = getCustomKernelConfigAndAssembly(name, directory)
     try:
         return yaml.safe_load(rawConfig)["custom.config"]
diff --git a/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s b/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
deleted file mode 100644
index 6199997f34..0000000000
--- a/src/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s
+++ /dev/null
@@ -1,10827 +0,0 @@
-/***********************************************************************************/
-/*                                                                                 */
-/* Copyright (C) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.       */
-/*                                                                                 */
-/* Permission is hereby granted, free of charge, to any person obtaining a copy    */
-/* of this software and associated documentation files (the "Software"), to deal   */
-/* in the Software without restriction, including without limitation the rights    */
-/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell       */
-/* copies of the Software, and to permit persons to whom the Software is           */
-/* furnished to do so, subject to the following conditions:                        */
-/*                                                                                 */
-/* The above copyright notice and this permission notice shall be included in      */
-/* all copies or substantial portions of the Software.                             */
-/*                                                                                 */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR      */
-/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,        */
-/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE     */
-/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER          */
-/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,   */
-/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE   */
-/* SOFTWARE.                                                                       */
-/*                                                                                 */
-/***********************************************************************************/
-
-
-/******************************************/
-/* Function Prefix                        */
-/******************************************/
-
-
-
-/******************************************/
-/* Begin Kernel                           */
-/******************************************/
-
-// Component.Signature.SignatureCOV3
-.amdgcn_target "amdgcn-amd-amdhsa--gfx90a"
-.text
-.protected DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4
-.globl DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4
-.p2align 8
-.type DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4,@function
-.section .rodata,#alloc
-.p2align 6
-.amdhsa_kernel DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4
-  .amdhsa_user_sgpr_kernarg_segment_ptr 1
-  .amdhsa_accum_offset 256 // accvgpr offset
-  .amdhsa_next_free_vgpr 256 // vgprs
-  .amdhsa_next_free_sgpr 73 // sgprs
-  .amdhsa_group_segment_fixed_size 32768 // lds bytes
-  .amdhsa_private_segment_fixed_size 0
-  .amdhsa_system_sgpr_workgroup_id_x 1
-  .amdhsa_system_sgpr_workgroup_id_y 1
-  .amdhsa_system_sgpr_workgroup_id_z 1
-  .amdhsa_system_vgpr_workitem_id 0
-.end_amdhsa_kernel
-.text
-
-/******************************************/
-/* Optimizations and Config:              */
-/******************************************/
-/* ThreadTile= 8 x 8 */
-/* SubGroup= 16 x 16 */
-/* VectorWidth=2 */
-/* GlobalLoadVectorWidthA=2, GlobalLoadVectorWidthB=2 */
-/* DirectToLdsA=False */
-/* DirectToLdsB=False */
-/* UseSgprForGRO=False */
-.amdgpu_metadata
----
-custom.config:
-   ProblemType:
-      OperationType: GEMM
-      DataType: D
-      TransposeA: False
-      TransposeB: False
-      UseBeta: True
-      Batched: True
-   MatrixInstruction: [ 16, 16, 4, 1 ]
-   ThreadTile: [ 2, 128 ]
-   WorkGroup: [ 64, 4, 1 ]
-   DepthU: 16
-   VectorWidth: 2
-   SourceSwap: 1
-   GlobalReadVectorWidth: 2
-   StaggerUStride: 128
-   StaggerU: 4
-   WorkGroupMapping: 4
-   AssertSizeMultiple: {3: 32}
-amdhsa.version:
-  - 1
-  - 1
-amdhsa.kernels:
-  - .name: DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4
-    .symbol: 'DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.kd'
-    .language:                   OpenCL C
-    .language_version:
-      - 2
-      - 0
-    .args:
-      - .name:            sizeC
-        .size:            8
-        .offset:          0
-        .value_kind:      by_value
-        .value_type:      u64
-      - .name:            sizeA
-        .size:            8
-        .offset:          8
-        .value_kind:      by_value
-        .value_type:      u64
-      - .name:            sizeB
-        .size:            8
-        .offset:          16
-        .value_kind:      by_value
-        .value_type:      u64
-      - .name:            D
-        .size:            8
-        .offset:          24
-        .value_kind:      global_buffer
-        .value_type:      f64
-        .address_space:   generic
-      - .name:            C
-        .size:            8
-        .offset:          32
-        .value_kind:      global_buffer
-        .value_type:      f64
-        .address_space:   generic
-      - .name:            A
-        .size:            8
-        .offset:          40
-        .value_kind:      global_buffer
-        .value_type:      f64
-        .address_space:   generic
-      - .name:            B
-        .size:            8
-        .offset:          48
-        .value_kind:      global_buffer
-        .value_type:      f64
-        .address_space:   generic
-      - .name:            alpha
-        .size:            8
-        .offset:          56
-        .value_kind:      by_value
-        .value_type:      f64
-      - .name:            beta
-        .size:            8
-        .offset:          64
-        .value_kind:      by_value
-        .value_type:      f64
-      - .name:            strideD0
-        .size:            4
-        .offset:          72
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideD1
-        .size:            4
-        .offset:          76
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideC0
-        .size:            4
-        .offset:          80
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideC1
-        .size:            4
-        .offset:          84
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideA0
-        .size:            4
-        .offset:          88
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideA1
-        .size:            4
-        .offset:          92
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideB0
-        .size:            4
-        .offset:          96
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            strideB1
-        .size:            4
-        .offset:          100
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            SizesFree0
-        .size:            4
-        .offset:          104
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            SizesFree1
-        .size:            4
-        .offset:          108
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            SizesFree2
-        .size:            4
-        .offset:          112
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            SizesSum0
-        .size:            4
-        .offset:          116
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            OrigStaggerUIter
-        .size:            4
-        .offset:          120
-        .value_kind:      by_value
-        .value_type:      i32
-      - .name:            NumWorkGroups0
-        .size:            4
-        .offset:          124
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            NumWorkGroups1
-        .size:            4
-        .offset:          128
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            NumFullBlocks
-        .size:            4
-        .offset:          132
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            WgmRemainder1
-        .size:            4
-        .offset:          136
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            MagicNumberWgmRemainder1
-        .size:            4
-        .offset:          140
-        .value_kind:      by_value
-        .value_type:      u32
-      - .name:            padding
-        .size:            4
-        .offset:          144
-        .value_kind:      by_value
-        .value_type:      u32
-    .group_segment_fixed_size:   32768
-    .kernarg_segment_align:      8
-    .kernarg_segment_size:       152
-    .max_flat_workgroup_size:    256
-    .private_segment_fixed_size: 0
-    .sgpr_count:                 73
-    .sgpr_spill_count:           0
-    .vgpr_count:                 256
-    .vgpr_spill_count:           0
-    .wavefront_size:             64
-...
-.end_amdgpu_metadata
-DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4:
-
-/******************************************/
-/* Asm syntax workarounds                 */
-/******************************************/
-.macro _v_add_co_u32 dst:req, cc:req, src0:req, src1:req, dpp=
-   v_add_co_u32 \dst, \cc, \src0, \src1 \dpp
-.endm
-
-.macro _v_add_u32 dst:req, src0:req, src1:req, dpp=
-   v_add_u32 \dst, \src0, \src1 \dpp
-.endm
-
-.macro _v_add_i32 dst:req, src0:req, src1:req, dpp=
-   v_add_i32 \dst, \src0, \src1 \dpp
-.endm
-
-.macro _v_addc_co_u32 dst:req, ccOut:req, src0:req, ccIn:req, src1:req, dpp=
-   v_addc_co_u32 \dst, \ccOut, \src0, \ccIn, \src1 \dpp
-.endm
-
-.macro _v_sub_co_u32 dst:req, cc:req, src0:req, src1:req, dpp=
-   v_sub_co_u32 \dst, \cc, \src0, \src1 \dpp
-.endm
-
-.macro _v_sub_u32 dst:req, src0:req, src1:req, dpp=
-   v_sub_u32 \dst, \src0, \src1 \dpp
-.endm
-
-.macro _v_sub_i32 dst:req, src0:req, src1:req, dpp=
-   v_sub_i32 \dst, \src0, \src1 \dpp
-.endm
-
-.macro _v_add_lshl_u32 dst:req, src0:req, src1:req, shiftCnt:req
-    v_add_lshl_u32 \dst, \src0, \src1, \shiftCnt
-.endm
-
-.macro _v_lshl_add_u32 dst:req, src0:req, src1:req, shiftCnt:req
-    v_lshl_add_u32 \dst, \src0, \src1, \shiftCnt
-.endm
-
-.macro _v_lshl_or_b32 dst:req, src0:req, shiftCnt:req, src1:req
-    v_lshl_or_b32 \dst, \src0, \shiftCnt, \src1
-.endm
-
-.macro _v_cmpx_lt_i16 dst, src0, src1=
-   v_cmpx_lt_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lt_i32 dst, src0, src1=
-   v_cmpx_lt_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lt_i64 dst, src0, src1=
-   v_cmpx_lt_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lt_u16 dst, src0, src1=
-   v_cmpx_lt_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lt_u32 dst, src0, src1=
-   v_cmpx_lt_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lt_u64 dst, src0, src1=
-   v_cmpx_lt_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_i16 dst, src0, src1=
-   v_cmpx_eq_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_i32 dst, src0, src1=
-   v_cmpx_eq_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_i64 dst, src0, src1=
-   v_cmpx_eq_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_u16 dst, src0, src1=
-   v_cmpx_eq_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_u32 dst, src0, src1=
-   v_cmpx_eq_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_eq_u64 dst, src0, src1=
-   v_cmpx_eq_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_i16 dst, src0, src1=
-   v_cmpx_le_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_i32 dst, src0, src1=
-   v_cmpx_le_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_i64 dst, src0, src1=
-   v_cmpx_le_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_u16 dst, src0, src1=
-   v_cmpx_le_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_u32 dst, src0, src1=
-   v_cmpx_le_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_le_u64 dst, src0, src1=
-   v_cmpx_le_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_i16 dst, src0, src1=
-   v_cmpx_gt_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_i32 dst, src0, src1=
-   v_cmpx_gt_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_i64 dst, src0, src1=
-   v_cmpx_gt_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_u16 dst, src0, src1=
-   v_cmpx_gt_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_u32 dst, src0, src1=
-   v_cmpx_gt_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_gt_u64 dst, src0, src1=
-   v_cmpx_gt_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_i16 dst, src0, src1=
-   v_cmpx_ne_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_i32 dst, src0, src1=
-   v_cmpx_ne_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_i64 dst, src0, src1=
-   v_cmpx_ne_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_u16 dst, src0, src1=
-   v_cmpx_ne_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_u32 dst, src0, src1=
-   v_cmpx_ne_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ne_u64 dst, src0, src1=
-   v_cmpx_ne_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_i16 dst, src0, src1=
-   v_cmpx_lg_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_i32 dst, src0, src1=
-   v_cmpx_lg_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_i64 dst, src0, src1=
-   v_cmpx_lg_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_u16 dst, src0, src1=
-   v_cmpx_lg_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_u32 dst, src0, src1=
-   v_cmpx_lg_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_lg_u64 dst, src0, src1=
-   v_cmpx_lg_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_i16 dst, src0, src1=
-   v_cmpx_ge_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_i32 dst, src0, src1=
-   v_cmpx_ge_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_i64 dst, src0, src1=
-   v_cmpx_ge_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_u16 dst, src0, src1=
-   v_cmpx_ge_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_u32 dst, src0, src1=
-   v_cmpx_ge_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_ge_u64 dst, src0, src1=
-   v_cmpx_ge_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_i16 dst, src0, src1=
-   v_cmpx_o_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_i32 dst, src0, src1=
-   v_cmpx_o_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_i64 dst, src0, src1=
-   v_cmpx_o_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_u16 dst, src0, src1=
-   v_cmpx_o_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_u32 dst, src0, src1=
-   v_cmpx_o_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_o_u64 dst, src0, src1=
-   v_cmpx_o_u64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_i16 dst, src0, src1=
-   v_cmpx_u_i16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_i32 dst, src0, src1=
-   v_cmpx_u_i32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_i64 dst, src0, src1=
-   v_cmpx_u_i64 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_u16 dst, src0, src1=
-   v_cmpx_u_u16 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_u32 dst, src0, src1=
-   v_cmpx_u_u32 \dst, \src0, \src1 
-.endm
-
-.macro _v_cmpx_u_u64 dst, src0, src1=
-   v_cmpx_u_u64 \dst, \src0, \src1 
-.endm
-.macro _v_mac_f32 c:req, a:req, b:req
-    v_mac_f32 \c, \a, \b
-.endmacro
-
-/******************************************/
-/* Magic div and mod functions            */
-/******************************************/
-.macro V_MAGIC_DIV dstIdx:req, dividend:req, magicNumber:req, magicShift:req, magicA:req
-    v_mul_hi_u32 v[\dstIdx+1], \dividend, \magicNumber
-    v_mul_lo_u32 v[\dstIdx+0], \dividend, \magicA
-    _v_add_u32 v[\dstIdx+0], v[\dstIdx+0], v[\dstIdx+1]
-    v_lshrrev_b32 v[\dstIdx+0], \magicShift, v[\dstIdx+0]
-.endm
-
-/******************************************/
-/* VGPR Assignments                       */
-/******************************************/
-/* ValuC range: [0-128), serializedStore enabled */
-.set vgprValuC, 0
-/* ValuA/B   Xn=PLR buffer idx */
-.set vgprValuA_X0_I0, 128
-.set vgprValuA_X1_I0, 132
-.set vgprValuA_X2_I0, 136
-.set vgprValuA_X3_I0, 140
-.set vgprValuB_X0_I0, 144
-.set vgprValuB_X1_I0, 160
-.set vgprValuB_X2_I0, 176
-.set vgprValuB_X3_I0, 192
-.set vgprLocalWriteAddrA, 208
-.set vgprLocalWriteAddrB, 209
-.set vgprGlobalReadOffsetA, 210
-.set vgprGlobalReadOffsetB, 214
-.set vgprG2LA, 218
-.set vgprValuA_X0_I1, 218
-.set vgprValuA_X1_I1, 222
-.set vgprValuA_X2_I1, 226
-.set vgprValuA_X3_I1, 230
-.set vgprG2LB, 234
-.set vgprLocalReadAddrA, 250
-.set vgprLocalReadAddrB, 251
-.set vgprSerial, 252
-/* Num VGPR=256 */
-/* Num AccVGPR=0 */
-
-/******************************************/
-/* SGPR Assignments                       */
-/******************************************/
-.set sgprKernArgAddress, 0
-.set sgprWorkGroup0, 2
-.set sgprWorkGroup1, 3
-.set sgprWorkGroup2, 4
-.set sgprLoopCounterL, 5
-.set sgprOrigLoopCounter, 6
-.set sgprSrdA, 8
-.set sgprSrdB, 12
-.set sgprSrdD, 16
-.set sgprSrdC, 20
-.set sgprTensor2dSizeA, 24
-.set sgprTensor2dSizeB, 26
-.set sgprAddressD, 28
-.set sgprAddressC, 30
-.set sgprAddressA, 32
-.set sgprAddressB, 34
-/* offsets pre-applied */
-.set sgprAlpha, 44
-.set sgprBeta, 46
-.set sgprStridesD, 48
-.set sgprStridesC, 50
-.set sgprStridesA, 52
-.set sgprStridesB, 54
-.set sgprSizesFree, 56
-.set sgprSizesSum, 59
-.set sgprOrigStaggerUIter, 60
-.set sgprNumWorkGroups0, 61
-.set sgprNumWorkGroups1, 62
-.set sgprNumFullBlocks, 63
-.set sgprWgmRemainder1, 64
-.set sgprMagicNumberWgmRemainder1, 65
-.set sgprShadowLimitA, 36
-.set sgprShadowLimitB, 38
-.set sgprStaggerUIter, 7
-.set sgprWrapUA, 40
-.set sgprWrapUB, 42
-.set sgprGlobalReadIncsA, 66
-.set sgprGlobalReadIncsB, 67
-/* max SGPR=73 */
-
-/* Size Assignments */
-.set sgprSizeI, sgprSizesFree+0
-.set sgprSizeJ, sgprSizesFree+1
-.set sgprSizeK, sgprSizesFree+2
-.set sgprSizeL, sgprSizesSum+0
-
-/* Stride Assignments */
-.set constStrideD0I, 1
-.set sgprStrideD1J, sgprStridesD+0
-.set sgprStrideDK, sgprStridesD+1
-.set constStrideC0I, 1
-.set sgprStrideC1J, sgprStridesC+0
-.set sgprStrideCK, sgprStridesC+1
-.set constStrideA0I, 1
-.set sgprStrideAL, sgprStridesA+0
-.set sgprStrideAK, sgprStridesA+1
-.set constStrideBL, 1
-.set sgprStrideB1J, sgprStridesB+0
-.set sgprStrideBK, sgprStridesB+1
-
-.set MT0, 128
-.set MT1, 128
-.set DepthU, 16
-.set GSU, 1
-.set BpeA, 8
-.set BpeALog2, 3
-.set BpeB, 8
-.set BpeBLog2, 3
-/* Number of elements to shift-left SRD */
-.set SrdShiftLeftA, 2
-.set SrdShiftLeftB, 2
-/* 2GB limit - set offsets to -1 to exceed this and clamp */
-.set BufferLimit, 0xffffffff
-.set BufferOOB, 0x80000000
-
-/******************************************/
-/* Bits 127:96 of SRD.                    */
-/* hex: 0x00020000                        */
-/* dst_sel_x (3b): 0                      */
-/* dst_sel_y (3b): 0                      */
-/* dst_sel_z (3b): 0                      */
-/* dst_sel_w (3b): 0                      */
-/* num_format (3b): 0                     */
-/* data_format (4b): 4                    */
-/* user_vm_enable (1b): 0                 */
-/* user_vm_mode (1b): 0                   */
-/* index_stride (2b): 0                   */
-/* add_tid_enable (1b): 0                 */
-/* _unusedA (3b): 0                       */
-/* nv (1b): 0                             */
-/* _unusedB (2b): 0                       */
-/* type (2b): 0                           */
-/******************************************/
-.set Srd127_96, 0x00020000
-
-/* Global Offset A */
-.macro GLOBAL_OFFSET_A vgprAddr:req vgprOffset0I:req vgprOffsetL:req vgprTmp:req
-v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideAL], v[\vgprOffsetL] // mul d1 lower
-_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffset0I], v[\vgprTmp+0] // accumulate K lower
-_v_add_u32 v[\vgprAddr+0], 0x2, v[\vgprAddr+0]     // add prepad for pointer shift
-v_lshlrev_b32 v[\vgprAddr+0], 0x3, v[\vgprAddr+0]  // offset *= bytes/element
-.endm
-
-/* Global Offset B */
-.macro GLOBAL_OFFSET_B vgprAddr:req vgprOffsetL:req vgprOffset1J:req vgprTmp:req
-v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideB1J], v[\vgprOffset1J] // mul d1 lower
-_v_add_co_u32 v[\vgprAddr+0], vcc, v[\vgprOffsetL], v[\vgprTmp+0] // accumulate K lower
-_v_add_u32 v[\vgprAddr+0], 0x2, v[\vgprAddr+0]     // add prepad for pointer shift
-v_lshlrev_b32 v[\vgprAddr+0], 0x3, v[\vgprAddr+0]  // offset *= bytes/element
-.endm
-
-/******************************************/
-/* Dynamic Scalar Divide: vQuotient=vDividend/vDivisor; vRemainder=vDividend%vDivisor; */
-/******************************************/
-.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp
-v_cvt_f32_u32 v[\vQuotient], v[\vDivisor]          // 
-v_rcp_f32 v[\vQuotient], v[\vQuotient]             // 
-v_mul_f32 v[\vQuotient], 0x4f800000, v[\vQuotient] // 
-v_cvt_u32_f32 v[\vQuotient], v[\vQuotient]         // 
-v_mul_lo_u32 v[\vRemainder], v[\vDivisor], v[\vQuotient] // 
-v_mul_hi_u32 v[\vTmp0], v[\vDivisor], v[\vQuotient] // 
-_v_sub_co_u32 v[\vTmp1], vcc, 0x0, v[\vRemainder]  // 
-v_cmp_ne_i32 s[\sTmp:\sTmp+1], 0x0, v[\vTmp0]      // 
-v_cndmask_b32 v[\vRemainder], v[\vTmp1], v[\vRemainder], s[\sTmp:\sTmp+1] // 
-v_mul_hi_u32 v[\vRemainder], v[\vRemainder], v[\vQuotient] // 
-_v_sub_co_u32 v[\vTmp0], vcc, v[\vQuotient], v[\vRemainder] // 
-_v_add_co_u32 v[\vQuotient], vcc, v[\vQuotient], v[\vRemainder] // 
-v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vTmp0], s[\sTmp:\sTmp+1] // 
-v_mul_hi_u32 v[\vQuotient], v[\vQuotient], v[\vDividend] // 
-v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // 
-_v_sub_co_u32 v[\vTmp0], vcc, v[\vDividend], v[\vRemainder] // 
-v_cmp_ge_u32 s[\sTmp:\sTmp+1], v[\vDividend], v[\vRemainder] // 
-_v_add_co_u32 v[\vRemainder], vcc, 0x1, v[\vQuotient] // 
-_v_add_co_u32 v[\vTmp1], vcc, -1, v[\vQuotient]    // 
-v_cmp_le_u32 vcc, v[\vDivisor], v[\vTmp0]          // 
-s_and_b64 vcc, s[\sTmp:\sTmp+1], vcc               // 
-v_cndmask_b32 v[\vQuotient], v[\vQuotient], v[\vRemainder], vcc // 
-v_cndmask_b32 v[\vQuotient], v[\vTmp1], v[\vQuotient], s[\sTmp:\sTmp+1] // 
-v_cmp_ne_i32 vcc, 0x0, v[\vDivisor]                // 
-v_cndmask_b32 v[\vQuotient], -1, v[\vQuotient], vcc // final result
-v_mul_lo_u32 v[\vRemainder], v[\vQuotient], v[\vDivisor] // 
-_v_sub_co_u32 v[\vRemainder], vcc, v[\vDividend], v[\vRemainder] // final result
-.endm
-
-
-
-/******************************************/
-/* Allocate Resources                     */
-/******************************************/
-
-s_setprio 3                                        // optimization store
-s_mov_b32 m0, 0x9000                               // LDS clamp at 36864 bytes
-v_mov_b32 v[vgprSerial], v0                        // thread serial id
-
-/* Load Kernel Args */
-s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 // 
-s_load_dwordx16 s[48:63], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x48 // 
-s_load_dwordx2 s[64:65], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x88 // 
-s_waitcnt lgkmcnt(0)                               // wait for 160 bytes of kern args
-s_mov_b32 s44, s36
-s_mov_b32 s45, s37
-s_mov_b32 s46, s38
-s_mov_b32 s47, s39
-
-/* Short circuit condition if Alpha == 0, then sumDims=0 */
-v_cmp_eq_f64 vcc, s[sgprAlpha:sgprAlpha+1], 0.0    // Alpha == 0.0 ?
-s_cbranch_vccz label_AlphaNonZero                  // branch if Alpha != 0
-s_mov_b32 s[sgprSizesSum+0], 0x0                   // Set summation dim=0 if Alpha == 0
-label_AlphaNonZero:
-
-
-/******************************************/
-/* Local Read Addresses                   */
-/******************************************/
-
-
-/* local read addresses: tile assignments a/b */
-
-/*lr0I*/
-v_and_b32 v2, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
-v_and_b32 v1, 15, v2                               // 1. N offset: nIdx = wtid % MI_N(16)
-                                                   // 1. N offset: nOffset = nIdx * nStride(1) (multiplier is 1, do nothing)
-v_lshrrev_b32 v0, 4, v2                            // 2. block offset: bnIdx = wtid / dividedForBlkId(16)
-v_and_b32 v0, 0, v0                                // 2. block offset: bnIdx = bnIdx % num1DBlocks(1)
-v_lshlrev_b32 v0, 0x4, v0                          // 2. block offset: bnOffset = bnIdx * strideBlock(16)
-_v_add_u32 v1, v0, v1                              // 3. add N and block offset: bnOffset = block and N offset
-v_lshlrev_b32 v1, 0x1, v1                          // 3. apply VectorWidth: bnOffset = bnOffset * vw(2)
-v_lshrrev_b32 v2, 4, v2                            // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
-v_lshlrev_b32 v2, 0x8, v2                          // 4. K offset: lrKOffset = kIdx * mStride(256)
-_v_add_u32 v1, v2, v1                              // 5. offset in wave: lrOffset = bnOffset + lrKOffset
-v_lshrrev_b32 v0, 6, v[vgprSerial]                 // 6. wave offset in N dimen: wtid = tid / dividedForWaveId(64)
-v_and_b32 v0, 3, v0                                // 6. wave offset in M dimen: wtid0 = wtid / num1DWaves(4)
-v_lshlrev_b32 v0, 0x5, v0                          // 6. wave offset in M dimen: wOffset = wtid0 * W0Stride(32)
-_v_add_u32 v1, v0, v1                              // 7. final local read offset: flrOffset = lrOffset + WOffset
-/*lr1J*/
-v_and_b32 v3, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
-v_and_b32 v2, 15, v3                               // 1. N offset: nIdx = wtid % MI_N(16)
-v_lshlrev_b32 v2, 0x4, v2                          // 1. N offset: nOffset = nIdx * nStride(16)
-v_lshrrev_b32 v0, 4, v3                            // 2. block offset: bnIdx = wtid / dividedForBlkId(16)
-v_and_b32 v0, 0, v0                                // 2. block offset: bnIdx = bnIdx % num1DBlocks(1)
-v_lshlrev_b32 v0, 0x8, v0                          // 2. block offset: bnOffset = bnIdx * strideBlock(256)
-_v_add_u32 v2, v0, v2                              // 3. add N and block offset: bnOffset = block and N offset
-                                                   // 3. apply VectorWidth: bnOffset = bnOffset * vw(1) (multiplier is 1, do nothing)
-v_lshrrev_b32 v3, 4, v3                            // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
-v_lshlrev_b32 v3, 0x1, v3                          // 4. K offset: lrKOffset = kIdx * mStride(2)
-_v_add_u32 v2, v3, v2                              // 5. offset in wave: lrOffset = bnOffset + lrKOffset
-
-
-/* local read addresses: final offsets a */
-
-// v_lshrrev_b32 v0, 8, v[vgprSerial]                 // LSU offset: sgid = Serial / subGroup(256)
-// s_mov_b32 s68, 128                                 // LSU offset: stirde = MT0(128) + PAD0(0)
-// v_mul_lo_u32 v0, s68, v0                           // LSU offset: lsuoffset = sgid*(MT0+PAD)
-// _v_add_lshl_u32 v[vgprLocalReadAddrA], v0, v1, 0x3 // Final Offset: offset = (lro0*VW+lsuoffset)*bpe
-
-
-/* local read addresses: final offsets b */
-
-v_lshrrev_b32 v0, 8, v[vgprSerial]                 // LSU offset: sgid = Serial / subGroup(256)
-s_mov_b32 s68, 128                                 // LSU offset: stirde = MT1(128) + PAD1(0)
-v_mul_lo_u32 v0, s68, v0                           // LSU offset: lsuoffset = sgid*(MT1+PAD)
-_v_add_lshl_u32 v[vgprLocalReadAddrB], v0, v2, 0x3 // Final Offset: offset = (lro1*VW+lsuoffset)*bpe
-v_lshrrev_b32 v1, 7, v[vgprLocalReadAddrB]         // Final Offset: padding 4 per block 128
-v_lshlrev_b32 v1, 0x5, v1                          // Final Offset: padding 4 per block 128
-_v_add_u32 v[vgprLocalReadAddrB], v1, v[vgprLocalReadAddrB] // Final Offset: add padding 4 per block 128
-
-
-/* local read addresses: declare addresses a */
-
-/* N/A */
-
-
-/* local read addresses: declare addresses b */
-
-// _v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x4000, v[vgprLocalReadAddrB+0] //  += LdsOffsetB (lower)
-
-
-
-/******************************************/
-/* Begin setupNewTile, isPap=False           */
-/******************************************/
-
-
-/* global read addresses: work-group */
-
-/* graWorkGroup mapping */
-s_mov_b32 s71, 0x20000001L                         // magic number for WGM==4
-s_mul_hi_u32 s69, s[sgprWorkGroup1], s71           // s_magic mul
-s_mul_i32 s68, s[sgprWorkGroup1], s71              // s_magic mul
-s_lshr_b64 s[68:69], s[68:69], 31                  // sMagicDiv
-s_mul_i32 s69, s68, 4                              // quotient * non-magic divisor
-s_sub_u32 s69, s[sgprWorkGroup1], s69              // WorkGroup1=remainder
-s_mul_i32 s69, s69, s[sgprNumWorkGroups0]          // (wg1 % WGM)*nwg0
-s_add_u32 s69, s69, s[sgprWorkGroup0]              // wgSerial = wg0 + (wg1 % WGM)*nwg0
-s_cmp_ge_u32 s68, s[sgprNumFullBlocks]             // blockId >= numFullBlocks ?
-s_cmov_b32 s71, s[sgprMagicNumberWgmRemainder1]    // 
-s_cselect_b32 s70, s[sgprWgmRemainder1], 4         // 
-s_mul_hi_u32 s3, s69, s71                          // s_magic mul
-s_mul_i32 s2, s69, s71                             // s_magic mul
-s_lshr_b64 s[2:3], s[2:3], 31                      // sMagicDiv
-s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s70 // quotient * non-magic divisor
-s_sub_u32 s[sgprWorkGroup1], s69, s[sgprWorkGroup1] // WorkGroup1=remainder
-s_mul_i32 s68, s68, 4                              // blockId * WGM
-s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s68 // wg1 += blockId * WGM
-
-
-/* global read addresses: tile offset assignment a */
-
-/* LVCA = 64 */
-/* v0 = (local)groA-tile = serial%LVCA (note (wgA*MTA) will be added to SRD) */
-/* v1 = groA-unroll = serial/LVCA */
-v_and_b32 v0, 15, v[vgprSerial]     // v0 = v[vgprSerial] % 16
-v_lshlrev_b32 v0, 0x1, v0           // v0 = v0 * 2
-v_lshrrev_b32 v1, 6, v[vgprSerial]  // v1 = v[vgprSerial] / 64
-v_lshlrev_b32 v1, 5, v1             // v1 = v1 * 32
-v_add_u32 v0, v0, v1                // v0 = v0 + v1
-
-v_and_b32 v1, 63, v[vgprSerial]     // v1 = (v[vgprSerial] % 64) / 16
-v_lshrrev_b32 v1, 4, v1             // v1 = (v[vgprSerial] % 64) / 16
-v_lshlrev_b32 v1, 1, v1             // v1 = v1 * 2
-
-// v_lshrrev_b32 v1, 6, v[vgprSerial]                 // v1 = v[vgprSerial] / 64
-// v_and_b32 v0, 63, v[vgprSerial]                    // v0 = v[vgprSerial] % 64
-// /* gro-tile *= glvw */
-// v_lshlrev_b32 v0, 0x1, v0                          // v0 = v0 * 2
-
-
-/* global read addresses: tile offset assignment b */
-
-/* LVCB = 8 */
-/* v2 = (local)groB-tile = serial/LVCB (note (wgB*MTB) will be added to SRD) */
-/* v3 = groB-unroll = serial%LVCB */
-v_and_b32 v6, 63, v[vgprSerial]                    // v6 = v[vgprSerial] % 64
-v_lshrrev_b32 v2, 3, v6                            // v2 = v6 / 8
-v_and_b32 v3, 7, v6                                // v3 = v6 % 8
-v_readfirstlane_b32 s68, v[vgprSerial]             // WaveIdxWavefrontWidth
-s_lshr_b32 s68, s68, 0x6                           // WaveId
-s_mul_i32 s68, s68, 32                             // Global Read Wave: each wave loads continuous lsp(8)*nrp(4) columns
-_v_add_u32 v2, s68, v2                             // Global Read Wave: add back to cloumn index
-/* gro-unroll *= glvw */
-v_lshlrev_b32 v3, 0x1, v3                          // v3 = v3 * 2
-
-
-/* global read addresses: unroll assignment a */
-
-/* v1 */
-
-
-/* global read addresses: unroll assignment b */
-
-/* v3 */
-
-
-/* global read addresses: other free assignments */
-
-/* s[sgprWorkGroup2] */
-
-
-/* global read addresses: tile offsets a */
-
-v_mov_b32 v4, v0                                   // groA0I_0
-
-
-/* global read addresses: tile offsets b */
-
-v_mov_b32 v5, v2                                   // groB1J_0
-_v_add_co_u32 v6, vcc, 8, v5                       // groB1J_1 += LSPB
-_v_add_co_u32 v7, vcc, 8, v6                       // groB1J_2 += LSPB
-_v_add_co_u32 v8, vcc, 8, v7                       // groB1J_3 += LSPB
-
-
-/* global read addresses: unroll offsets a */
-
-v_mov_b32 v9, v1                                   // groAL_0
-_v_add_co_u32 v10, vcc, 1, v9                      // groAL_1 + LSPA
-_v_add_co_u32 v11, vcc, 8, v9                     // groAL_2 + LSPA
-_v_add_co_u32 v12, vcc, 9, v9                     // groAL_3 + LSPA
-
-
-/* global read addresses: unroll offsets b */
-
-v_mov_b32 v13, v3                                  // groBL_0
-
-
-/* global read addresses: shift a */
-
-s_mul_i32 s68, s[sgprWorkGroup0], 128              // WorkGroup[01] * MT
-s_sub_u32 s68, s[sgprSizeI], s68                   // edge = Size0I - WG*MT
-s_sub_u32 s68, s68, 2                              // edge -= margin(2)
-v_mov_b32 v14, s68                                 // edge vgpr = Size0I- WG*MT - margin(2)
-_v_add_co_u32 v15, vcc, v14, 2                     // shiftedEdge = edge + srdShiftLeft(2)
-_v_add_co_u32 v16, vcc, v4, 2                      // shiftedOffset = offset + srdShiftLeft(2)
-v_cmp_lt_u32 s[68:69], v16, v15                    // shiftedOffset < shiftedEdge
-v_cndmask_b32 v4, v14, v4, s[68:69]                // offset = (offset < edge) ? offset(v4) : edge(v14)
-
-
-/* global read addresses: final offsets a */
-
-GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0,  4,  9, 14 // gROA_0_0_0_0
-GLOBAL_OFFSET_A vgprGlobalReadOffsetA+1,  4, 10, 14 // gROA_0_0_1_0
-GLOBAL_OFFSET_A vgprGlobalReadOffsetA+2,  4, 11, 14 // gROA_0_0_2_0
-GLOBAL_OFFSET_A vgprGlobalReadOffsetA+3,  4, 12, 14 // gROA_0_0_3_0
-
-
-/* global read addresses: final offsets b */
-
-GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0, 13,  5, 9 // gROB_0_0_0_0
-GLOBAL_OFFSET_B vgprGlobalReadOffsetB+1, 13,  6, 9 // gROB_0_0_1_0
-GLOBAL_OFFSET_B vgprGlobalReadOffsetB+2, 13,  7, 9 // gROB_0_0_2_0
-GLOBAL_OFFSET_B vgprGlobalReadOffsetB+3, 13,  8, 9 // gROB_0_0_3_0
-
-
-/* global read addresses: addresses a */
-
-/* max read offset = size[n] * stride[n-1] */
-s_mul_hi_u32 s71, s[sgprWorkGroup0], 128           // WorkGroup[01] * MT
-s_mul_i32 s70, s[sgprWorkGroup0], 128              // WorkGroup[01] * MT
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprTensor2dSizeA], s70 // sub tileStart
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprTensor2dSizeA+1], s71 // sub tileStart
-s_lshl_b64 s[sgprShadowLimitA:sgprShadowLimitA+1], s[sgprShadowLimitA:sgprShadowLimitA+1], 0x3 // Set limit to use bytes
-s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad
-s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
-s_mul_hi_u32 s69, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG
-s_mul_i32 s68, s[sgprStrideAK], s[sgprWorkGroup2]  // Stride*WG
-s_add_u32 s70, s70, s68                            // accum wg term to tilestart
-s_addc_u32 s71, s71, s69                           // accum wg term to tilestart
-s_lshl_b64 s[70:71], s[70:71], 0x3                 // tileStart *= BPE
-s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s70    // SRD base = Address+ tileStart0
-s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s71   // SRD base = Address+ tileStart1
-s_sub_u32 s[sgprSrdA+0], s[sgprSrdA+0], 16         // pre-pad to make room for possible pointer shift
-s_subb_u32 s[sgprSrdA+1], s[sgprSrdA+1], 0         // pre-pad to make room for possible pointer shift
-s_mov_b32 s[sgprSrdA+3], Srd127_96                 // Set bits 127_96 in SRD
-
-
-/* global read addresses: addresses b */
-
-/* max read offset = size[n] * stride[n-1] */
-s_mul_hi_u32 s71, s[sgprWorkGroup1], 128           // WorkGroup[01] * MT
-s_mul_i32 s70, s[sgprWorkGroup1], 128              // WorkGroup[01] * MT
-s_mul_hi_u32 s71, s70, s[sgprStrideB1J]            // tlu=0, scaled tile-offset by stride
-s_mul_i32 s70, s70, s[sgprStrideB1J]               // tlu=0, scaled tile-offset by stride
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprTensor2dSizeB], s70 // sub tileStart
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprTensor2dSizeB+1], s71 // sub tileStart
-s_lshl_b64 s[sgprShadowLimitB:sgprShadowLimitB+1], s[sgprShadowLimitB:sgprShadowLimitB+1], 0x3 // Set limit to use bytes
-s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad
-s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
-s_mul_hi_u32 s69, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG
-s_mul_i32 s68, s[sgprStrideBK], s[sgprWorkGroup2]  // Stride*WG
-s_add_u32 s70, s70, s68                            // accum wg term to tilestart
-s_addc_u32 s71, s71, s69                           // accum wg term to tilestart
-s_lshl_b64 s[70:71], s[70:71], 0x3                 // tileStart *= BPE
-s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s70    // SRD base = Address+ tileStart0
-s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s71   // SRD base = Address+ tileStart1
-s_sub_u32 s[sgprSrdB+0], s[sgprSrdB+0], 16         // pre-pad to make room for possible pointer shift
-s_subb_u32 s[sgprSrdB+1], s[sgprSrdB+1], 0         // pre-pad to make room for possible pointer shift
-s_mov_b32 s[sgprSrdB+3], Srd127_96                 // Set bits 127_96 in SRD
-
-
-/* global read addresses: increments a */
-
-s_mul_i32 s[sgprGlobalReadIncsA+0], DepthU*BpeA, s[sgprStrideAL] // incrA unrollIdx)
-
-
-/* global read addresses: increments b */
-
-s_mov_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeB    // incrB (unrollIdx)
-
-
-/******************************************/
-/* Local Write Addresses                  */
-/******************************************/
-
-/* lwaTileAssignmentA = v0 */
-
-/* lwaTileAssignmentB = v2 */
-
-/* lwaUnrollAssignmentA = v1 */
-
-/* lwaUnrollAssignmentB = v3 */
-
-
-/* local write addresses: first offset a */
-
-// v_mul_u32_u24 v[vgprLocalWriteAddrA], 0x80, v1     // lwAL**(MTA + PAD)
-// _v_add_lshl_u32 v[vgprLocalWriteAddrA], v0, v[vgprLocalWriteAddrA], 0x3 // lwFOA = (lwAA + lwAL*(MT0I+PAD))*bpe
-
-
-/* local write addresses: first offset b */
-
-v_mul_u32_u24 v[vgprLocalWriteAddrB], 0x10, v2     // lwBL**(DepthU_Compute + PAD)
-_v_add_lshl_u32 v[vgprLocalWriteAddrB], v3, v[vgprLocalWriteAddrB], 0x3 // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpe
-v_lshrrev_b32 v3, 7, v[vgprLocalWriteAddrB]        // padding 4 per block 128
-v_lshlrev_b32 v3, 0x5, v3                          // padding 4 per block 128
-_v_add_u32 v[vgprLocalWriteAddrB], v3, v[vgprLocalWriteAddrB] // add padding 4 per block 128
-// _v_add_co_u32 v[vgprLocalWriteAddrB], vcc, 0x4000, v[vgprLocalWriteAddrB] // lwFOB = lwB1J + lwBL*MT1J + LDS_OFFSET_B=2048*8
-
-
-
-
-
-
-
-/* declare loop num iterations */
-
-
-s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 4 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 16
-s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter
-
-s_and_b32 s[sgprStaggerUIter], s[sgprOrigStaggerUIter], s[sgprWorkGroup0] // Compute actual stagger start for this tile
-s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], 0 // shift by StaggerUStride
-
-
-/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */
-s_mul_hi_i32 s69, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
-s_mul_i32 s68, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset
-s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
-s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
-s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration
-s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1]     // remove one iteration
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-
-
-/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */
-s_mul_hi_i32 s69, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
-s_mul_i32 s68, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset
-s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
-s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
-s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration
-s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1]     // remove one iteration
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // Subtract (PGR-1); StaggerUIter now contains target iteration to wrap
-
-/* local read addresses: init pointers a */
-
-
-/* localReadInitPointers */
-
-/* local read addresses: init pointers b */
-
-
-/* localReadInitPointers */
-
-
-/* prefetch: global -> local */
-
-s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?
-// s_setprio 0                                        // optimization store
-s_cbranch_scc1 ShadowInitStart_9                   // skip to ShadowInitStart iter b/c numIter==0
-
-buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-
-buffer_load_dwordx4 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-buffer_load_dwordx4 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-buffer_load_dwordx4 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-buffer_load_dwordx4 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-
-
-
-
-/* global read inc A loopL */
-s_add_u32 s70, s[sgprLoopCounterL], 1              // remove pf(1)
-s_cmp_eq_u32 s[sgprStaggerUIter], s70              // Is this wrapIter? (pf)
-s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUA+1], 0              // incUpper <- ?
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-
-/* global read inc B loopL */
-s_add_u32 s70, s[sgprLoopCounterL], 1              // remove pf(1)
-s_cmp_eq_u32 s[sgprStaggerUIter], s70              // Is this wrapIter? (pf)
-s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUB+1], 0              // incUpper <- ?
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-
-
-/******************************************/
-/* End setupNewTile, isPap=False             */
-/******************************************/
-
-ShadowInitStart_9: // 
-
-s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0]         // init SRD base address (lower)
-s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1]         // init SRD base address (upper) + other fields
-s_mov_b32 s[sgprSrdD+2], 0x80000000                // 
-s_mov_b32 s[sgprSrdD+3], Srd127_96                 // Set bits 127_96 in post-loop SRD
-
-s_mov_b32 s[sgprSrdC+0], s[sgprAddressC+0]         // init SRD base address (lower)
-s_mov_b32 s[sgprSrdC+1], s[sgprAddressC+1]         // init SRD base address (upper) + other fields
-s_mov_b32 s[sgprSrdC+2], 0x80000000                // 
-s_mov_b32 s[sgprSrdC+3], Srd127_96                 // Set bits 127_96 in post-loop SRD
-
-
-s_mul_i32 s70, MT1, s[sgprWorkGroup1]              // <- wg1*MT1
-s_mul_hi_u32 s69, s70, s[sgprStrideC1J]            // CScale s70 by Stride
-s_mul_i32 s68, s70, s[sgprStrideC1J]               // CScale s70 by Stride
-s_lshl_b64 s[68:69], s[68:69], 3                   // scale by bpe
-s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // add lo to SRD
-s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s69       // add hi to SRD
-s_mul_hi_u32 s69, s70, s[sgprStrideD1J]            // Scale s70 by Stride
-s_mul_i32 s68, s70, s[sgprStrideD1J]               // Scale s70 by Stride
-s_lshl_b64 s[68:69], s[68:69], 3                   // scale by bpe
-s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // add lo to SRD
-s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s69       // add hi to SRD
-
-s_mul_hi_u32 s69, s[sgprWorkGroup2], s[sgprStrideCK] // CScale s[sgprWorkGroup2] by Stride
-s_mul_i32 s68, s[sgprWorkGroup2], s[sgprStrideCK]  // CScale s[sgprWorkGroup2] by Stride
-s_lshl_b64 s[68:69], s[68:69], 3                   // scale by bpe
-s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s68        // add lo to SRD
-s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s69       // add hi to SRD
-s_mul_hi_u32 s69, s[sgprWorkGroup2], s[sgprStrideDK] // Scale s[sgprWorkGroup2] by Stride
-s_mul_i32 s68, s[sgprWorkGroup2], s[sgprStrideDK]  // Scale s[sgprWorkGroup2] by Stride
-s_lshl_b64 s[68:69], s[68:69], 3                   // scale by bpe
-s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s68        // add lo to SRD
-s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s69       // add hi to SRD
-
-
-
-/* initC: remove C-tile 0-128 from pool */
-v_mov_b32 v208, 15728640                           // set out-of-bound addr
-ds_read_b32 v[vgprValuC+0], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+1], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+2], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+3], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+4], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+5], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+6], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+7], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+8], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+9], v208, offset:0         // initC
-ds_read_b32 v[vgprValuC+10], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+11], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+12], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+13], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+14], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+15], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+16], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+17], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+18], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+19], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+20], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+21], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+22], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+23], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+24], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+25], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+26], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+27], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+28], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+29], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+30], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+31], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+32], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+33], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+34], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+35], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+36], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+37], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+38], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+39], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+40], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+41], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+42], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+43], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+44], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+45], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+46], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+47], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+48], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+49], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+50], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+51], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+52], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+53], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+54], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+55], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+56], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+57], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+58], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+59], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+60], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+61], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+62], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+63], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+64], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+65], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+66], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+67], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+68], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+69], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+70], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+71], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+72], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+73], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+74], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+75], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+76], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+77], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+78], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+79], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+80], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+81], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+82], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+83], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+84], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+85], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+86], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+87], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+88], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+89], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+90], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+91], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+92], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+93], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+94], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+95], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+96], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+97], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+98], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+99], v208, offset:0        // initC
-ds_read_b32 v[vgprValuC+100], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+101], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+102], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+103], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+104], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+105], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+106], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+107], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+108], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+109], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+110], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+111], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+112], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+113], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+114], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+115], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+116], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+117], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+118], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+119], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+120], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+121], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+122], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+123], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+124], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+125], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+126], v208, offset:0       // initC
-ds_read_b32 v[vgprValuC+127], v208, offset:0       // initC
-
-/* initC: remove AB-tile 128-208 from pool */
-
-s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?
-
-/* after InitC, skip to end of prefetch last iter if numIter==0 */
-s_cbranch_scc0 label_NoBranch_10                   // Only branch on scc1
-s_getpc_B64 s[68:69]                               // addr of next instr
-s_add_i32 s70, PrefetchGlobalLastIterEnd_4, 0x4    // target branch offset
-s_cmp_ge_i32 s70, 0x0                              // check positive or negative
-s_cbranch_scc1 label_Positive_11                   // jump when positive
-s_abs_i32 s70, s70                                 // abs offset
-s_sub_u32 s68, s68, s70                            // sub target branch offset
-s_subb_u32 s69, s69, 0                             // sub high and carry
-s_setpc_b64 s[68:69]                               // branch to PrefetchGlobalLastIterEnd_4
-label_Positive_11:
-s_add_u32 s68, s68, s70                            // add target branch offset
-s_addc_u32 s69, s69, 0                             // add high and carry
-s_setpc_b64 s[68:69]                               // branch to PrefetchGlobalLastIterEnd_4
-label_NoBranch_10:
-
-s_waitcnt vmcnt(4)                                 // lgkmcnt=-1 vmcnt=08wait for global read
-
-/* local write b */
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840
-
-
-/* local write swap a */
-
-
-
-/* local write swap b */
-
-
-
-
-s_cmp_eq_u32 s[sgprLoopCounterL] 0x1               // PGR=2 but only 1 loop
-s_cbranch_scc1 label_0012                          // PGR=2 but only 1 loop
-
-
-buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-
-buffer_load_dwordx4 v[vgprValuA_X0_I1+0:vgprValuA_X0_I1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-buffer_load_dwordx4 v[vgprValuA_X0_I1+4:vgprValuA_X0_I1+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-buffer_load_dwordx4 v[vgprValuA_X0_I1+8:vgprValuA_X0_I1+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-buffer_load_dwordx4 v[vgprValuA_X0_I1+12:vgprValuA_X0_I1+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-
-label_0012:                                        // 
-
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-10prefetch wait for local write
-
-// Skip force waitcnt0
-s_barrier //
-
-
-/* local read prefetch b */
-
-ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-
-
-/* local read inc a */
-
-/* N/A, lro->128 */
-/* self.localReadDoCntA 1 self.localReadDoCntB 1 */
-
-
-/* local read inc b */
-
-/* N/A, lro->8 */
-/* self.localReadDoCntA 1 self.localReadDoCntB 1 */
-
-
-
-/******************************************/
-/* Unrolled Loop(s) - Begin               */
-/******************************************/
-
-openLoopL_13:
-s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // LoopCounterL < EndCounter
-s_cbranch_scc1 label_0014                          // PGR=2 but only 1 loop, toPGR1
-s_cmp_le_u32 s[sgprLoopCounterL], 0x2              // LoopCounterL < EndCounter
-s_cbranch_scc1 LoopEndL_2                          // do not enter LoopL
-LoopBeginL_1:
-
-
-/******************************************/
-/* Unrolled Loop 1/2 - Begin              */
-/******************************************/
-
-
-/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */
-
-
-
-
-
-/* iter 0 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:0  */
-s_waitcnt lgkmcnt(0) vmcnt(8)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:1  */
-ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc A loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUA+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:2  */
-ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:3  */
-ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:4  */
-s_setprio 0                                        // store optimization
-ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc B loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUB+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:5  */
-ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:6  */
-ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:7  */
-ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:8  */
-ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:9  */
-/* localReadsVacancy: letencyLeft 1 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:10  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:11  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:12  */
-s_setprio 3                                        // store optimization
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:13  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:14  */
-/* 1 LDS buffer: read-sync-write */
-s_waitcnt lgkmcnt(0)                               // 
-s_barrier                                          // 
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:15  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127]
-buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-/* numPrefetchIter=0 */
-/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-/* iter 1 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:16  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(1)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:17  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:18  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:19  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:20  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:21  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:22  */
-buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:23  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:24  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:25  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:26  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:27  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:28  */
-buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:29  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:30  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:31  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:32  */
-s_waitcnt lgkmcnt(3)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:33  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:34  */
-buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:35  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:36  */
-s_setprio 0                                        // store optimization
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:37  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:38  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:39  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:40  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:41  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:42  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:43  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:44  */
-s_setprio 3                                        // store optimization
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-13wait for local write
-s_barrier //
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:45  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:46  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:47  */
-
-/* local read swap offsets a */
-
-/* local read swap offsets b */
-
-/* local read init pointers a */
-
-/* localReadInitPointers */
-
-/* local read init pointers b */
-
-/* localReadInitPointers */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 3 (swap and reset local write pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:48  */
-
-/* local write swap offsets a */
-
-/* local write swap offsets b */
-s_waitcnt lgkmcnt(4)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:49  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:50  */
-ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:51  */
-ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:52  */
-buffer_load_dwordx4 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:53  */
-ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:54  */
-ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:55  */
-ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:56  */
-buffer_load_dwordx4 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:57  */
-ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:58  */
-ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:59  */
-ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:60  */
-buffer_load_dwordx4 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:61  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:62  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:63  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[120:127]
-buffer_load_dwordx4 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-/* numPrefetchIter=1 */
-/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-
-
-/******************************************/
-/* Unrolled Loop - End                    */
-/******************************************/
-
-s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
-
-/******************************************/
-/* Unrolled Loop 2/2 - Begin              */
-/******************************************/
-
-
-/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */
-
-
-
-
-
-/* iter 0 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:0  */
-s_waitcnt lgkmcnt(0) vmcnt(8)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:1  */
-ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc A loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUA+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:2  */
-ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:3  */
-ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:4  */
-s_setprio 0                                        // store optimization
-ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc B loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUB+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:5  */
-ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:6  */
-ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:7  */
-ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:8  */
-ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:9  */
-/* localReadsVacancy: letencyLeft 1 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:10  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:11  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:12  */
-s_setprio 3                                        // store optimization
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:13  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:14  */
-/* 1 LDS buffer: read-sync-write */
-s_waitcnt lgkmcnt(0)                               // 
-s_barrier                                          // 
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:15  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127]
-buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-/* numPrefetchIter=0 */
-/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-/* iter 1 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:16  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(1)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:17  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:18  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:19  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:20  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:21  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:22  */
-buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:23  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:24  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:25  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:26  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:27  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:28  */
-buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:29  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:30  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:31  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:32  */
-s_waitcnt lgkmcnt(3)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:33  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=7wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:34  */
-buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:35  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:36  */
-s_setprio 0                                        // store optimization
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:37  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:38  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:39  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:40  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:41  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:42  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:43  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:44  */
-s_setprio 3                                        // store optimization
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-13wait for local write
-s_barrier //
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:45  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:46  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:47  */
-
-/* local read swap offsets a */
-
-/* local read swap offsets b */
-
-/* local read init pointers a */
-
-/* localReadInitPointers */
-
-/* local read init pointers b */
-
-/* localReadInitPointers */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 3 (swap and reset local write pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:48  */
-
-/* local write swap offsets a */
-
-/* local write swap offsets b */
-s_waitcnt lgkmcnt(4)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:49  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:50  */
-ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:51  */
-ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:52  */
-buffer_load_dwordx4 v[vgprValuA_X0_I1+0:vgprValuA_X0_I1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_0_0
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:53  */
-ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:54  */
-ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:55  */
-ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:56  */
-buffer_load_dwordx4 v[vgprValuA_X0_I1+4:vgprValuA_X0_I1+4+3], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_1_0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:57  */
-ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:58  */
-ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:59  */
-ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:60  */
-buffer_load_dwordx4 v[vgprValuA_X0_I1+8:vgprValuA_X0_I1+8+3], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_2_0
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:61  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:62  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:63  */
-s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
-s_cmp_eq_u32 s[sgprLoopCounterL], 0x2              // counterL==2
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127]
-buffer_load_dwordx4 v[vgprValuA_X0_I1+12:vgprValuA_X0_I1+12+3], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // G -> Reg 0_0_3_0
-/* numPrefetchIter=1 */
-/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-
-
-/******************************************/
-/* Unrolled Loop - End                    */
-/******************************************/
-
-
-/* closeLoop loopL finalLoop=1 tailLoop=0 */
-s_cbranch_scc0 LoopBeginL_1                        // restart LoopL
-LoopEndL_oddexit_3: // unroll loop odditer exit
-LoopEndL_2:
-
-
-/* Before NLL: Check VGPR.checkin for INT8 LW */
-
-
-
-
-/******************************************/
-/*  NoGlobalLoadLoop - Begin              */
-/******************************************/
-
-s_setprio 0                                        // store optimization
-
-/* iter 0 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:0  */
-s_waitcnt lgkmcnt(0) vmcnt(8)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:1  */
-ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc A loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUA+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:2  */
-ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:3  */
-ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:4  */
-ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-/* global read inc B loopL */
-s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
-s_cselect_b32 s68, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
-s_cselect_b32 s69, s[sgprWrapUB+1], 0              // incUpper <- ?
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:5  */
-ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:6  */
-ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:7  */
-ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:8  */
-ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:9  */
-/* localReadsVacancy: letencyLeft 1 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:10  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:11  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:12  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:13  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:14  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:15  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-/* iter 1 */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:16  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:17  */
-/* localReadsVacancy: letencyLeft 5 */
-/* 1 LDS buffer: read-sync-write */
-s_waitcnt lgkmcnt(0)                               // 
-s_barrier                                          // 
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:18  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:19  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:20  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:21  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:22  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:23  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:24  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:25  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:26  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:27  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:28  */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:29  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:30  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+0+0+0:vgprValuA_X1_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:31  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I0+2+0+0:vgprValuA_X1_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:32  */
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:33  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:34  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:35  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:36  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:37  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:38  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:39  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:40  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:41  */
-s_waitcnt vmcnt(7)                                 // lgkmcnt=-1 vmcnt=3wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:42  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:43  */
-s_waitcnt vmcnt(6)                                 // lgkmcnt=-1 vmcnt=2wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:44  */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:45  */
-s_waitcnt vmcnt(5)                                 // lgkmcnt=-1 vmcnt=1wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:46  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:47  */
-s_waitcnt vmcnt(4)                                 // lgkmcnt=-1 vmcnt=0wait for global read before writing to local
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840
-
-/* local read swap offsets a */
-
-/* local read swap offsets b */
-
-/* local read init pointers a */
-
-/* localReadInitPointers */
-
-/* local read init pointers b */
-
-/* localReadInitPointers */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+2+0+0:vgprValuA_X2_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 3 (swap and reset local write pointers iteration)  */
-
-/*  grEndMfmaIndex:6, lwStartMfmaIndex:18, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:48  */
-
-/* local write swap offsets a */
-
-/* local write swap offsets b */
-s_waitcnt lgkmcnt(4)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=4 newLW=4 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[0:7]
-/*  mfmaIndex:49  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[8:15]
-/*  mfmaIndex:50  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[16:23]
-/*  mfmaIndex:51  */
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-13wait for local write
-// Skip force waitcnt0
-s_barrier //
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[24:31]
-/*  mfmaIndex:52  */
-ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[32:39]
-/*  mfmaIndex:53  */
-ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[40:47]
-/*  mfmaIndex:54  */
-ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[48:55]
-/*  mfmaIndex:55  */
-ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[56:63]
-/*  mfmaIndex:56  */
-ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[64:71]
-/*  mfmaIndex:57  */
-ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[72:79]
-/*  mfmaIndex:58  */
-ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[80:87]
-/*  mfmaIndex:59  */
-ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[88:95]
-/*  mfmaIndex:60  */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[96:103]
-/*  mfmaIndex:61  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[104:111]
-/*  mfmaIndex:62  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+0+0+0:vgprValuA_X3_I0+0+0+0+1], v[112:119]
-/*  mfmaIndex:63  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I0+2+0+0:vgprValuA_X3_I0+2+0+0+1], v[120:127]
-/* numPrefetchIter=1 */
-/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-label_0014:
-
-
-/******************************************/
-/* Opt. NoLoadLoop Without PAP - Begin         */
-/******************************************/
-
-s_mov_b32 s68, s[sgprBeta+0]                       // tmp = Beta[0]
-s_or_b32 s68, s[sgprBeta+1], s68                   // tmp |= Beta[1] 
-s_cmpk_eq_u32 s68, 0x0                             // Beta == 0
-s_cbranch_scc0 OptNLL_End_16                       // Branch if Beta is not zero
-
-s_mov_b32 s68, 0                                   // Low part of double 1.0
-s_mov_b32 s69, 0x3ff00000                          // High part of double 1.0
-s_cmp_eq_u64 s[sgprAlpha:sgprAlpha+1], s[68:69]    // Alpha == 1.0 ?
-s_cbranch_scc0 OptNLL_End_16                       // branch if alpha != 1
-
-s_and_b32 s68, 127, s[sgprSizeI]                   // s68 = s[sgprSizeI] % 128
-s_add_u32 s69, -0x1, s[sgprNumWorkGroups0]         // 
-s_cmp_ge_u32 s[sgprWorkGroup0], s69                // wg0 >= nwg0-1 ?
-s_cselect_b32 s68, s68, 0                          // set rMT0
-s_cmpk_gt_u32 s68, 0x0                             // rMT0 > 0
-s_cbranch_scc1 OptNLL_End_16                       // jump if edges required
-s_and_b32 s68, 127, s[sgprSizeJ]                   // s68 = s[sgprSizeJ] % 128
-s_add_u32 s69, -0x1, s[sgprNumWorkGroups1]         // 
-s_cmp_ge_u32 s[sgprWorkGroup1], s69                // wg1 >= nwg1-1
-s_cselect_b32 s68, s68, 0                          // set rMT1
-s_cmpk_gt_u32 s68, 0x0                             // rMT1 > 0
-s_cbranch_scc1 OptNLL_End_16                       // jump if edges required
-
-s_and_b32 s69, 15, s[sgprSizesSum+0]               // s69 = s[sgprSizesSum+0] % 16
-s_cmp_eq_u32 s69, 0x0                              // numIterL == 0
-s_cbranch_scc0 OptNLL_End_16                       // skip if tail loop required
-
-
-
-/* iter 0 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:0  */
-s_waitcnt lgkmcnt(0) vmcnt(3)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:1  */
-ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:2  */
-ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:3  */
-ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:4  */
-ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:5  */
-ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:6  */
-ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:7  */
-ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:8  */
-ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:9  */
-/* localReadsVacancy: letencyLeft 1 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:10  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:11  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:12  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:13  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:14  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:15  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-/* iter 1 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:16  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(0) vmcnt(2)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:17  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:18  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:19  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:20  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:21  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:22  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:23  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:24  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:25  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:26  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:27  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:28  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:29  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:30  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:31  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 2 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:32  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(0) vmcnt(1)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:33  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:34  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:35  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:36  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:37  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:38  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:39  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:40  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:41  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:42  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:43  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:44  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:45  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:46  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:47  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 3 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:48  */
-s_waitcnt lgkmcnt(0) vmcnt(0)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:49  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:50  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:51  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:52  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:53  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:54  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:55  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:56  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:57  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:58  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:59  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:60  */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:61  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:62  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:63  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=0 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-/* Stores for OptNLL */
-Summation_End_OptNLL_17:
-s_setprio 0                                        // optimization store
-/* endSummation: add vgpr [128...132) to pool (vgprValuA_X0_I0) */
-/* endSummation: add vgpr [144...160) to pool (vgprValuB_X0_I0) */
-/* endSummation: add vgpr [208...252) to pool */
-.set NumFullBlocks, UNDEF
-.set WgmRemainder1, UNDEF
-.set MagicNumberWgmRemainder1, UNDEF
-.set ShadowLimitA, UNDEF
-.set ShadowLimitB, UNDEF
-.set WrapUA, UNDEF
-.set WrapUB, UNDEF
-.set GlobalReadIncsA, UNDEF
-.set GlobalReadIncsB, UNDEF
-
-/* Mapping of Acc register -> C Vgpr register */
-
-/* Multiply MI out register with Alpha -> C Vgpr register */
-/* computeStoreVgprs */
-v_lshrrev_b32 v144, 6, v[vgprSerial]               // v144 = v[vgprSerial] / 64
-v_lshrrev_b32 v145, 2, v144                        // v145 = v144 / 4
-v_mul_lo_u32 v145, 0x10, v145                      // wave coordination offset 1
-v_and_b32 v129, 63, v[vgprSerial]                  // v129 = v[vgprSerial] % 64
-v_lshrrev_b32 v129, 4, v129                        // v129 = v129 / 16
-                                                   // thread0 * continuous_output (multiplier is 1, do nothing)
-v_add_u32 v129, v145, v129                         // coordination 1 = wave_id1 + tid1
-v_mul_lo_u32 v130, v129, s[sgprStrideC1J]          //  offset 1
-v_mul_lo_u32 v131, v129, s[sgprStrideD1J]          //  offset 1
-v_and_b32 v128, 3, v144                            // v128 = v144 % 4
-v_mul_lo_u32 v128, 0x10, v128                      // wave coordination offset 0
-v_and_b32 v145, 15, v[vgprSerial]                  // v145 = v[vgprSerial] % 16
-_v_add_lshl_u32 v128, v145, v128, 1                // coordination 0 = wave_id0 + tid0
-s_mul_i32 s63, 128, s[sgprWorkGroup0]              // wgp0 * MT0
-v_add_u32 v128, s63, v128                          // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0
-s_mul_i32 s63, 128, s[sgprWorkGroup1]              // wgp1 * MT1
-v_add_u32 v129, s63, v129                          // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1
-GW_B0_E0_20:
-
-/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-_v_add_lshl_u32 v146, v131, v128, 0x3              // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+0] // copy MI out reg to vreg[0]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+1] // copy MI out reg to vreg[1]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+8] // copy MI out reg to vreg[2]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+9] // copy MI out reg to vreg[3]
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+2] // copy MI out reg to vreg[4]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+3] // copy MI out reg to vreg[5]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+10] // copy MI out reg to vreg[6]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+11] // copy MI out reg to vreg[7]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #2 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+4] // copy MI out reg to vreg[8]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+5] // copy MI out reg to vreg[9]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+12] // copy MI out reg to vreg[10]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+13] // copy MI out reg to vreg[11]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #3 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+6] // copy MI out reg to vreg[12]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+7] // copy MI out reg to vreg[13]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+14] // copy MI out reg to vreg[14]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+15] // copy MI out reg to vreg[15]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #4 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+16] // copy MI out reg to vreg[16]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+17] // copy MI out reg to vreg[17]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+24] // copy MI out reg to vreg[18]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+25] // copy MI out reg to vreg[19]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #5 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+18] // copy MI out reg to vreg[20]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+19] // copy MI out reg to vreg[21]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+26] // copy MI out reg to vreg[22]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+27] // copy MI out reg to vreg[23]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #6 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+20] // copy MI out reg to vreg[24]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+21] // copy MI out reg to vreg[25]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+28] // copy MI out reg to vreg[26]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+29] // copy MI out reg to vreg[27]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #7 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+22] // copy MI out reg to vreg[28]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+23] // copy MI out reg to vreg[29]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+30] // copy MI out reg to vreg[30]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+31] // copy MI out reg to vreg[31]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #8 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+32] // copy MI out reg to vreg[32]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+33] // copy MI out reg to vreg[33]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+40] // copy MI out reg to vreg[34]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+41] // copy MI out reg to vreg[35]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #9 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+34] // copy MI out reg to vreg[36]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+35] // copy MI out reg to vreg[37]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+42] // copy MI out reg to vreg[38]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+43] // copy MI out reg to vreg[39]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #10 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+36] // copy MI out reg to vreg[40]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+37] // copy MI out reg to vreg[41]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+44] // copy MI out reg to vreg[42]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+45] // copy MI out reg to vreg[43]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #11 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+38] // copy MI out reg to vreg[44]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+39] // copy MI out reg to vreg[45]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+46] // copy MI out reg to vreg[46]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+47] // copy MI out reg to vreg[47]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #12 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+48] // copy MI out reg to vreg[48]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+49] // copy MI out reg to vreg[49]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+56] // copy MI out reg to vreg[50]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+57] // copy MI out reg to vreg[51]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #13 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+50] // copy MI out reg to vreg[52]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+51] // copy MI out reg to vreg[53]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+58] // copy MI out reg to vreg[54]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+59] // copy MI out reg to vreg[55]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #14 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+52] // copy MI out reg to vreg[56]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+53] // copy MI out reg to vreg[57]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+60] // copy MI out reg to vreg[58]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+61] // copy MI out reg to vreg[59]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #15 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+54] // copy MI out reg to vreg[60]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+55] // copy MI out reg to vreg[61]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+62] // copy MI out reg to vreg[62]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+63] // copy MI out reg to vreg[63]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #16 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+64] // copy MI out reg to vreg[64]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+65] // copy MI out reg to vreg[65]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+72] // copy MI out reg to vreg[66]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+73] // copy MI out reg to vreg[67]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #17 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+66] // copy MI out reg to vreg[68]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+67] // copy MI out reg to vreg[69]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+74] // copy MI out reg to vreg[70]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+75] // copy MI out reg to vreg[71]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #18 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+68] // copy MI out reg to vreg[72]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+69] // copy MI out reg to vreg[73]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+76] // copy MI out reg to vreg[74]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+77] // copy MI out reg to vreg[75]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #19 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+70] // copy MI out reg to vreg[76]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+71] // copy MI out reg to vreg[77]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+78] // copy MI out reg to vreg[78]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+79] // copy MI out reg to vreg[79]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #20 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+80] // copy MI out reg to vreg[80]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+81] // copy MI out reg to vreg[81]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+88] // copy MI out reg to vreg[82]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+89] // copy MI out reg to vreg[83]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #21 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+82] // copy MI out reg to vreg[84]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+83] // copy MI out reg to vreg[85]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+90] // copy MI out reg to vreg[86]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+91] // copy MI out reg to vreg[87]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #22 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+84] // copy MI out reg to vreg[88]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+85] // copy MI out reg to vreg[89]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+92] // copy MI out reg to vreg[90]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+93] // copy MI out reg to vreg[91]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #23 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+86] // copy MI out reg to vreg[92]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+87] // copy MI out reg to vreg[93]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+94] // copy MI out reg to vreg[94]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+95] // copy MI out reg to vreg[95]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #24 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+96] // copy MI out reg to vreg[96]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+97] // copy MI out reg to vreg[97]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+104] // copy MI out reg to vreg[98]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+105] // copy MI out reg to vreg[99]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #25 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+98] // copy MI out reg to vreg[100]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+99] // copy MI out reg to vreg[101]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+106] // copy MI out reg to vreg[102]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+107] // copy MI out reg to vreg[103]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #26 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+100] // copy MI out reg to vreg[104]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+101] // copy MI out reg to vreg[105]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+108] // copy MI out reg to vreg[106]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+109] // copy MI out reg to vreg[107]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #27 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+102] // copy MI out reg to vreg[108]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+103] // copy MI out reg to vreg[109]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+110] // copy MI out reg to vreg[110]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+111] // copy MI out reg to vreg[111]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #28 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+112] // copy MI out reg to vreg[112]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+113] // copy MI out reg to vreg[113]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+120] // copy MI out reg to vreg[114]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+121] // copy MI out reg to vreg[115]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #29 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+114] // copy MI out reg to vreg[116]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+115] // copy MI out reg to vreg[117]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+122] // copy MI out reg to vreg[118]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+123] // copy MI out reg to vreg[119]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #30 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+116] // copy MI out reg to vreg[120]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+117] // copy MI out reg to vreg[121]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+124] // copy MI out reg to vreg[122]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+125] // copy MI out reg to vreg[123]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #31 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-v_mov_b32 v[vgprValuC+148], v[vgprValuC+118] // copy MI out reg to vreg[124]
-v_mov_b32 v[vgprValuC+149], v[vgprValuC+119] // copy MI out reg to vreg[125]
-v_mov_b32 v[vgprValuC+150], v[vgprValuC+126] // copy MI out reg to vreg[126]
-v_mov_b32 v[vgprValuC+151], v[vgprValuC+127] // copy MI out reg to vreg[127]
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_22                           // jump to end
-label_GW_End_22:
-
-s_endpgm                                           // Kernel End
-OptNLL_End_16:
-
-
-/******************************************/
-/* Ord. NoLoadLoop - Begin                */
-/******************************************/
-
-
-
-
-/* iter 0 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:0  */
-s_waitcnt lgkmcnt(0) vmcnt(3)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:1  */
-ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:2  */
-ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:2624 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:3  */
-ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:5184 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:4  */
-ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:7744 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:5  */
-ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:10304 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:6  */
-ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:12864 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:7  */
-ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:15424 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:8  */
-ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:17984 // L -> Reg lro=8 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:9  */
-/* localReadsVacancy: letencyLeft 1 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:10  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:11  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:12  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:13  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:14  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+0+0+0:vgprValuA_X0_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:15  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I1+2+0+0:vgprValuA_X0_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=8 */
-
-
-/* iter 1 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:16  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(0) vmcnt(2)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=1, new=1 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:17  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:18  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:19  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:20  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:21  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:22  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:23  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:24  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:25  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:26  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:27  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:28  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:29  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:30  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+0+0+0:vgprValuA_X1_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:31  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X1_I1+2+0+0:vgprValuA_X1_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=0 numReadsIterA=2 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 2 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:32  */
-/* localReadsVacancy: letencyLeft 5 */
-s_waitcnt lgkmcnt(0) vmcnt(1)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:33  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:34  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:35  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:36  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:37  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:38  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:39  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:40  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:41  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:42  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:43  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:44  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:45  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:46  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+0+0+0:vgprValuA_X2_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:47  */
-/* localReadsVacancy: letencyLeft 5 */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I1+2+0+0:vgprValuA_X2_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=1 numReadsIterA=3 skipReadsIterA=1 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-
-/* iter 3 (last unrolled loop) */
-
-/*  grEndMfmaIndex:0, lwStartMfmaIndex:48, lwEndMfmaIndex:48  */
-/*  numMfmaForLR:13, barrierMfmaIndex:50  */
-/*  mfmaIndex:48  */
-s_waitcnt lgkmcnt(0) vmcnt(0)                               // lgkmcnt=0 vmcnt=-1wait for prior local read local write old=0, new=0 newLW=0 newLR=0
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[0:7]
-/*  mfmaIndex:49  */
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[8:15]
-/*  mfmaIndex:50  */
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[16:23]
-/*  mfmaIndex:51  */
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[24:31]
-/*  mfmaIndex:52  */
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[32:39]
-/*  mfmaIndex:53  */
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[40:47]
-/*  mfmaIndex:54  */
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[48:55]
-/*  mfmaIndex:55  */
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[56:63]
-/*  mfmaIndex:56  */
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[64:71]
-/*  mfmaIndex:57  */
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[72:79]
-/*  mfmaIndex:58  */
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[80:87]
-/*  mfmaIndex:59  */
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[88:95]
-/*  mfmaIndex:60  */
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[96:103]
-/*  mfmaIndex:61  */
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[104:111]
-/*  mfmaIndex:62  */
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+0+0+0:vgprValuA_X3_I1+0+0+0+1], v[112:119]
-/*  mfmaIndex:63  */
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X3_I1+2+0+0:vgprValuA_X3_I1+2+0+0+1], v[120:127]
-/* numPrefetchIter=0 */
-/* dataAtIterA=2 numReadsIterA=3 skipReadsIterA=0 readsPerIterA=1 */
-/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=8 */
-
-PrefetchGlobalLastIterEnd_4:
-
-
-/******************************************/
-/* Tail Loop                              */
-/******************************************/
-
-
-/* local write reset offsets a */
-
-
-
-/* local write reset offsets b */
-
-
-/* tail loop: add vgpr [132...136) to pool (vgprValuA_X1_I0) */
-/* tail loop: add vgpr [136...140) to pool (vgprValuA_X2_I0) */
-/* tail loop: add vgpr [140...144) to pool (vgprValuA_X3_I0) */
-/* tail loop: add vgpr [160...176) to pool (vgprValuB_X1_I0) */
-/* tail loop: add vgpr [176...192) to pool (vgprValuB_X2_I0) */
-/* tail loop: add vgpr [192...208) to pool (vgprValuB_X3_I0) */
-
-//numIterL = (((sizeL % LOCAL_DEPTHU) + LOCAL_SPLITU - 1) / LOCAL_SPLITU)
-s_and_b32 s[sgprLoopCounterL], 15, s[sgprSizesSum+0] // s[sgprLoopCounterL] = s[sgprSizesSum+0] % 16
-s_cmp_eq_u32 s[sgprLoopCounterL], 0x0              // numIterL == 0
-s_mov_b32 s[sgprOrigLoopCounter], 0                // repurpose to count each localRead increment
-s_cbranch_scc1 SkipTailLoopL_7                     // skip to end of tail loop b/c numIter==0
-
-
-/* remove stagger offsets for tail loop */
-
-s_sub_i32 s68, 3, s[sgprStaggerUIter]              // 
-s_mul_hi_i32 s69, s68, s[sgprGlobalReadIncsA+0]    // start offset S in bytes
-s_mul_i32 s68, s68, s[sgprGlobalReadIncsA+0]       // start offset S in bytes
-s_sub_u32 s68, s68, s[sgprWrapUA]                  // S - WrapU
-s_subb_u32 s69, s69, s[sgprWrapUA+1]               // S - WrapU
-s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdA+1], s[sgprSrdA+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0]    // Move shadow to real if we are within 2^32
-
-s_sub_i32 s68, 3, s[sgprStaggerUIter]              // 
-s_mul_hi_i32 s69, s68, s[sgprGlobalReadIncsB+0]    // start offset S in bytes
-s_mul_i32 s68, s68, s[sgprGlobalReadIncsB+0]       // start offset S in bytes
-s_sub_u32 s68, s68, s[sgprWrapUB]                  // S - WrapU
-s_subb_u32 s69, s69, s[sgprWrapUB+1]               // S - WrapU
-s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s68        // gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdB+1], s[sgprSrdB+1], s69      // gra SRD += inc(upper)
-s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s68 // limit -= inc)
-s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s69 // limit -= inc)
-s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
-s_cmov_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0]    // Move shadow to real if we are within 2^32
-
-
-/* Update M0 for DTLDS */
-
-
-
-/* global read a */
-
-/* g2l=0, load component 0 */
-buffer_load_dwordx2 v[vgprG2LA+0+0:vgprG2LA+0+0+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value
-/* g2l=0, load component 1 */
-buffer_load_dwordx2 v[vgprG2LA+0+2:vgprG2LA+0+2+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value
-/* g2l=4, load component 0 */
-buffer_load_dwordx2 v[vgprG2LA+4+0:vgprG2LA+4+0+1], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value
-/* g2l=4, load component 1 */
-buffer_load_dwordx2 v[vgprG2LA+4+2:vgprG2LA+4+2+1], v[vgprGlobalReadOffsetA+1], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value
-/* g2l=8, load component 0 */
-buffer_load_dwordx2 v[vgprG2LA+8+0:vgprG2LA+8+0+1], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value
-/* g2l=8, load component 1 */
-buffer_load_dwordx2 v[vgprG2LA+8+2:vgprG2LA+8+2+1], v[vgprGlobalReadOffsetA+2], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value
-/* g2l=12, load component 0 */
-buffer_load_dwordx2 v[vgprG2LA+12+0:vgprG2LA+12+0+1], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:0 // load one buffer value
-/* g2l=12, load component 1 */
-buffer_load_dwordx2 v[vgprG2LA+12+2:vgprG2LA+12+2+1], v[vgprGlobalReadOffsetA+3], s[sgprSrdA:sgprSrdA+3], 0, offen offset:8 // load one buffer value
-
-
-/* Update M0 for DTLDS */
-
-
-
-/* global read b */
-
-/* g2l=0, load component 0 */
-buffer_load_dwordx2 v[vgprG2LB+0+0:vgprG2LB+0+0+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value
-/* g2l=0, load component 1 */
-buffer_load_dwordx2 v[vgprG2LB+0+2:vgprG2LB+0+2+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value
-/* g2l=4, load component 0 */
-buffer_load_dwordx2 v[vgprG2LB+4+0:vgprG2LB+4+0+1], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value
-/* g2l=4, load component 1 */
-buffer_load_dwordx2 v[vgprG2LB+4+2:vgprG2LB+4+2+1], v[vgprGlobalReadOffsetB+1], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value
-/* g2l=8, load component 0 */
-buffer_load_dwordx2 v[vgprG2LB+8+0:vgprG2LB+8+0+1], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value
-/* g2l=8, load component 1 */
-buffer_load_dwordx2 v[vgprG2LB+8+2:vgprG2LB+8+2+1], v[vgprGlobalReadOffsetB+2], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value
-/* g2l=12, load component 0 */
-buffer_load_dwordx2 v[vgprG2LB+12+0:vgprG2LB+12+0+1], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:0 // load one buffer value
-/* g2l=12, load component 1 */
-buffer_load_dwordx2 v[vgprG2LB+12+2:vgprG2LB+12+2+1], v[vgprGlobalReadOffsetB+3], s[sgprSrdB:sgprSrdB+3], 0, offen offset:8 // load one buffer value
-
-s_waitcnt vmcnt(0)                                 // lgkmcnt=-1 vmcnt=02wait for global read
-
-// Skip force waitcnt0
-s_barrier //
-
-
-
-
-/* local write a */
-
-ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+0:vgprG2LA+0+3] offset:0 // lwoA_0_0_0_0 = (0*LSCA) + (0*LSPA)(*MT0I+PAD) = 0
-ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+4:vgprG2LA+4+3] offset:4096 // lwoA_0_0_1_0 = (0*LSCA) + (1*LSPA)(*MT0I+PAD) = 4096
-ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+8:vgprG2LA+8+3] offset:8192 // lwoA_0_0_2_0 = (0*LSCA) + (2*LSPA)(*MT0I+PAD) = 8192
-ds_write_b128 v[vgprLocalWriteAddrA], v[vgprG2LA+12:vgprG2LA+12+3] offset:12288 // lwoA_0_0_3_0 = (0*LSCA) + (3*LSPA)(*MT0I+PAD) = 12288
-
-
-/* local write b */
-
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:1280 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 1280
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:2560 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 2560
-ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:3840 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 3840
-
-
-/* Recalc local read offsets */
-
-/*lr0I*/
-v_and_b32 v134, 63, v[vgprSerial]                  // 0. thread id in wave: wtid = tid % wavelength(64)
-v_and_b32 v133, 15, v134                           // 1. N offset: nIdx = wtid % MI_N(16)
-                                                   // 1. N offset: nOffset = nIdx * nStride(1) (multiplier is 1, do nothing)
-v_lshrrev_b32 v132, 4, v134                        // 2. block offset: bnIdx = wtid / dividedForBlkId(16)
-v_and_b32 v132, 0, v132                            // 2. block offset: bnIdx = bnIdx % num1DBlocks(1)
-v_lshlrev_b32 v132, 0x4, v132                      // 2. block offset: bnOffset = bnIdx * strideBlock(16)
-_v_add_u32 v133, v132, v133                        // 3. add N and block offset: bnOffset = block and N offset
-v_lshlrev_b32 v133, 0x1, v133                      // 3. apply VectorWidth: bnOffset = bnOffset * vw(2)
-v_lshrrev_b32 v134, 4, v134                        // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
-v_lshlrev_b32 v134, 0x7, v134                      // 4. K offset: lrKOffset = kIdx * mStride(128)
-_v_add_u32 v133, v134, v133                        // 5. offset in wave: lrOffset = bnOffset + lrKOffset
-v_lshrrev_b32 v132, 6, v[vgprSerial]               // 6. wave offset in N dimen: wtid = tid / dividedForWaveId(64)
-v_and_b32 v132, 3, v132                            // 6. wave offset in M dimen: wtid0 = wtid / num1DWaves(4)
-v_lshlrev_b32 v132, 0x5, v132                      // 6. wave offset in M dimen: wOffset = wtid0 * W0Stride(32)
-_v_add_u32 v133, v132, v133                        // 7. final local read offset: flrOffset = lrOffset + WOffset
-/*lr1J*/
-v_and_b32 v135, 63, v[vgprSerial]                  // 0. thread id in wave: wtid = tid % wavelength(64)
-v_and_b32 v134, 15, v135                           // 1. N offset: nIdx = wtid % MI_N(16)
-v_lshlrev_b32 v134, 0x4, v134                      // 1. N offset: nOffset = nIdx * nStride(16)
-v_lshrrev_b32 v132, 4, v135                        // 2. block offset: bnIdx = wtid / dividedForBlkId(16)
-v_and_b32 v132, 0, v132                            // 2. block offset: bnIdx = bnIdx % num1DBlocks(1)
-v_lshlrev_b32 v132, 0x8, v132                      // 2. block offset: bnOffset = bnIdx * strideBlock(256)
-_v_add_u32 v134, v132, v134                        // 3. add N and block offset: bnOffset = block and N offset
-                                                   // 3. apply VectorWidth: bnOffset = bnOffset * vw(1) (multiplier is 1, do nothing)
-v_lshrrev_b32 v135, 4, v135                        // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
-                                                   // 4. K offset: lrKOffset = kIdx * mStride(1) (multiplier is 1, do nothing)
-_v_add_u32 v134, v135, v134                        // 5. offset in wave: lrOffset = bnOffset + lrKOffset
-v_lshrrev_b32 v132, 8, v[vgprSerial]               // LSU offset: sgid = Serial / subGroup(256)
-s_mov_b32 s68, 128                                 // LSU offset: stirde = MT0(128) + PAD0(0)
-v_mul_lo_u32 v132, s68, v132                       // LSU offset: lsuoffset = sgid*(MT0+PAD)
-_v_add_lshl_u32 v[vgprLocalReadAddrA], v132, v133, 0x3 // Final Offset: offset = (lro0*VW+lsuoffset)*bpe
-/* N/A */
-v_lshrrev_b32 v132, 8, v[vgprSerial]               // LSU offset: sgid = Serial / subGroup(256)
-s_mov_b32 s68, 128                                 // LSU offset: stirde = MT1(128) + PAD1(0)
-v_mul_lo_u32 v132, s68, v132                       // LSU offset: lsuoffset = sgid*(MT1+PAD)
-_v_add_lshl_u32 v[vgprLocalReadAddrB], v132, v134, 0x3 // Final Offset: offset = (lro1*VW+lsuoffset)*bpe
-v_lshrrev_b32 v133, 7, v[vgprLocalReadAddrB]       // Final Offset: padding 4 per block 128
-v_lshlrev_b32 v133, 0x5, v133                      // Final Offset: padding 4 per block 128
-_v_add_u32 v[vgprLocalReadAddrB], v133, v[vgprLocalReadAddrB] // Final Offset: add padding 4 per block 128
-_v_add_co_u32 v[vgprLocalReadAddrB+0], vcc, 0x4000, v[vgprLocalReadAddrB+0] //  += LdsOffsetB (lower)
-
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-15wait for local write
-
-// Skip force waitcnt0
-s_barrier //
-
-
-/* local read reset offsets a */
-
-
-
-/* local read reset offsets b */
-
-
-
-/* local read init pointers a */
-
-
-/* localReadInitPointers */
-
-
-/* local read init pointers b */
-
-
-/* localReadInitPointers */
-
-
-/* tail loop: macs */
-
-TailLoopBeginL_5:
-
-
-/* local read a */
-
-ds_read_b128 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprLocalReadAddrA] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=128 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-
-
-/* local read b */
-
-ds_read_b64 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+1], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+2:vgprValuB_X0_I0+2+1], v[vgprLocalReadAddrB] offset:2560 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=1 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+1], v[vgprLocalReadAddrB] offset:5120 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=2 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+6:vgprValuB_X0_I0+6+1], v[vgprLocalReadAddrB] offset:7680 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=3 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+1], v[vgprLocalReadAddrB] offset:10240 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=4 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+10:vgprValuB_X0_I0+10+1], v[vgprLocalReadAddrB] offset:12800 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=5 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+1], v[vgprLocalReadAddrB] offset:15360 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=6 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-ds_read_b64 v[vgprValuB_X0_I0+14:vgprValuB_X0_I0+14+1], v[vgprLocalReadAddrB] offset:17920 // L -> Reg lro=0 swapByteOffset=0 ti=16 vIdx=7 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
-
-
-/* local read inc a */
-
-s_mov_b32 s68, 0x1000                              // inc
-_v_add_co_u32 v[vgprLocalReadAddrA], vcc, s68, v[vgprLocalReadAddrA] // lrA += 4096 (LSU*(MT+PAD)*bpe)
-
-
-/* local read inc b */
-
-s_mov_b32 s68, 0x20                                // inc
-_v_add_co_u32 v[vgprLocalReadAddrB], vcc, s68, v[vgprLocalReadAddrB] // lrB += 32 (LSU*(MT+PAD)*bpe)
-
-s_waitcnt lgkmcnt(0)                               // lgkmcnt=0 vmcnt=-14wait for local read
-
-
-v_and_b32 v132, 63, v[vgprSerial]                  // v132 = v[vgprSerial] % 64
-v_lshrrev_b32 v132, 4, v132                        // v132 = v132 / 16
-                                                   // v132 = v132 * 1 (multiplier is 1, do nothing)
-v_cmp_ge_i32 s[68:69], v132, s[sgprLoopCounterL]   // check K index >= Size L
-v_cndmask_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuA_X0_I0+2+0], v[vgprValuA_X0_I0+2+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+0+0], v[vgprValuB_X0_I0+0+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+2+0], v[vgprValuB_X0_I0+2+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+4+0], v[vgprValuB_X0_I0+4+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+6+0], v[vgprValuB_X0_I0+6+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_X0_I0+8+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+10+0], v[vgprValuB_X0_I0+10+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+12+0], v[vgprValuB_X0_I0+12+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+14+0], v[vgprValuB_X0_I0+14+0], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuA_X0_I0+2+1], v[vgprValuA_X0_I0+2+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+0+1], v[vgprValuB_X0_I0+0+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+2+1], v[vgprValuB_X0_I0+2+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+4+1], v[vgprValuB_X0_I0+4+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+6+1], v[vgprValuB_X0_I0+6+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_X0_I0+8+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+10+1], v[vgprValuB_X0_I0+10+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+12+1], v[vgprValuB_X0_I0+12+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-v_cndmask_b32 v[vgprValuB_X0_I0+14+1], v[vgprValuB_X0_I0+14+1], 0x0, s[68:69] // set 0 if K_idx >= sizeL
-s_nop 1
-v_mfma_f64_16x16x4f64 v[0:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[0:7]
-v_mfma_f64_16x16x4f64 v[8:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[8:15]
-v_mfma_f64_16x16x4f64 v[16:23], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[16:23]
-v_mfma_f64_16x16x4f64 v[24:31], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[24:31]
-v_mfma_f64_16x16x4f64 v[32:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[32:39]
-v_mfma_f64_16x16x4f64 v[40:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[40:47]
-v_mfma_f64_16x16x4f64 v[48:55], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[48:55]
-v_mfma_f64_16x16x4f64 v[56:63], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[56:63]
-v_mfma_f64_16x16x4f64 v[64:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[64:71]
-v_mfma_f64_16x16x4f64 v[72:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[72:79]
-v_mfma_f64_16x16x4f64 v[80:87], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[80:87]
-v_mfma_f64_16x16x4f64 v[88:95], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[88:95]
-v_mfma_f64_16x16x4f64 v[96:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[96:103]
-v_mfma_f64_16x16x4f64 v[104:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[104:111]
-v_mfma_f64_16x16x4f64 v[112:119], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], v[112:119]
-v_mfma_f64_16x16x4f64 v[120:127], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], v[120:127]
-
-
-/* closeLoop loopL finalLoop=1 tailLoop=1 */
-s_sub_i32 s[sgprLoopCounterL], s[sgprLoopCounterL], 0x4 // dec counterL (tailLoop)
-s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x4 // inc counterL
-s_cmp_le_i32 s[sgprLoopCounterL], 0x0              // counterL<=0
-s_cbranch_scc0 TailLoopBeginL_5                    // restart LoopL
-TailLoopEndL_6:
-
-SkipTailLoopL_7:
-
-Summation_End_25:
-s_setprio 0                                        // optimization store
-/* endSummation: add vgpr [128...132) to pool (vgprValuA_X0_I0) */
-/* endSummation: add vgpr [144...160) to pool (vgprValuB_X0_I0) */
-/* endSummation: add vgpr [208...252) to pool */
-.set NumFullBlocks, UNDEF
-.set WgmRemainder1, UNDEF
-.set MagicNumberWgmRemainder1, UNDEF
-.set ShadowLimitA, UNDEF
-.set ShadowLimitB, UNDEF
-.set WrapUA, UNDEF
-.set WrapUB, UNDEF
-.set GlobalReadIncsA, UNDEF
-.set GlobalReadIncsB, UNDEF
-
-/* Mapping of Acc register -> C Vgpr register */
-
-/* Multiply MI out register with Alpha -> C Vgpr register */
-
-// TODO in Generator
-// skip shift vector if M % 2 == 0
-s_and_b32 s63, 0x1, s[sgprSizeI]
-s_cbranch_scc0 label_0029                                // done shifting
-
-/* shift vector components d0 */
-
-v_mov_b32 v131, s[sgprWorkGroup0]                  // 
-v_mul_i32_i24 v131, -0x80, v131                    // wg*MT
-_v_add_co_u32 v131, vcc, s[sgprSizesFree+0], v131  // wgMT = Size - wg*MT
-v_mov_b32 v132, 0x80                               // MT
-v_cmp_lt_u32 s[64:65], v131, v132                  // wgMT < MT
-v_cndmask_b32 v131, v132, v131, s[64:65]           // wgMT = (wgMT < MT) ? wgMT : MT
-v_lshrrev_b32 v133, 6, v[vgprSerial]               // v133 = v[vgprSerial] / 64
-v_and_b32 v133, 3, v133                            // v133 = v133 % 4
-v_lshrrev_b32 v134, 5, v131                        // v134 = v131 / 32
-v_and_b32 v134, 3, v134                            // v134 = v134 % 4
-v_cmp_eq_u32 s[64:65], v134, v133                  // wave_id == block_belong_to_wave?
-v_cndmask_b32 v131, v132, v131, s[64:65]           // wgMT = (wgMT < MT) ? wgMT : MT
-
-/* mbReg: which mb block need to shift, mb(matrixInstCoal(16) * VectorWidth(2)) */
-v_lshrrev_b32 v132, 5, v131                        // v132 = v131 / 32
-v_lshlrev_b32 v134, 0x0, v133                      // v134 = v133 * 1
-_v_sub_u32 v132, v132, v134                        // 
-
-/* gbReg: glvw block id */
-v_lshrrev_b32 v134, 1, v131                        // v134 = v131 / 2
-
-/* tgbReg: glvw block id */
-v_lshrrev_b32 v135, 0, v[vgprSerial]               // v135 = v[vgprSerial] / 1
-v_and_b32 v135, 15, v135                           // v135 = v135 % 16
-v_lshlrev_b32 v135, 0x1, v135                      // v135 = v135 * 2
-v_lshrrev_b32 v135, 1, v135                        // v135 = v135 / 2
-v_lshlrev_b32 v133, 0x4, v133                      // v133 = v133 * 16
-_v_add_co_u32 v135, vcc, v133, v135                // tgbReg = (tid_coal * continOut) / GLVW
-_v_sub_u32 v134, v134, v135                        // 
-
-/* vwReg: glvw in which vw block? */
-v_and_b32 v133, 1, v131                            // permute register between threads
-v_lshrrev_b32 v133, 1, v133                        // permute register between threads
-
-/* rReg : reminder of M_size % GlobalLoadVectorWidth */
-v_and_b32 v135, 1, v131                            // v135 = v131 % 2
-v_cmp_eq_u32 vcc, v135, 0x1                        // wgMT%VW == 1
-s_cbranch_vccnz label_0026                         // branch to shift d0 r=1
-s_branch label_0029                                // no shifting
-
-/******************************************/
-/* shift d0 r=1                           */
-/******************************************/
-label_0026:
-v_cmp_eq_u32 vcc, v132, 0x0                        // 
-s_cbranch_vccnz label_0027                         // branch to shift d0 r1 mb0
-
-/******************************************/
-/* shift d0 r=1 mb=0                      */
-/******************************************/
-label_0027: // r1 mb0 
-v_cmp_eq_u32 vcc, v133, 0x0                        // 
-s_cbranch_vccnz label_0028                         // branch to shift d0 r1 mb0 vw0
-
-/******************************************/
-/* shift d0 r=1 mb=0 vw0                  */
-/******************************************/
-label_0028: // r1 mb0 vw0 
-s_mov_b32 s64, 0                                   // 
-v_cmpx_eq_u32 s[64:65], v134, s64                  // is thread in edge glvw region
-v_and_b32 v128, 63, v[vgprSerial]                  // permute register between threads
-v_lshlrev_b32 v128, 2, v128                        // permute register between threads
-v_mov_b32 v135, v8                                 // glvw 1 mb 0 tt1 0 r 0
-v_mov_b32 v0, v135                                 // 
-v_mov_b32 v135, v9                                 // glvw 1 mb 0 tt1 0 r 1
-v_mov_b32 v1, v135                                 // 
-v_mov_b32 v135, v10                                // glvw 1 mb 0 tt1 1 r 0
-v_mov_b32 v2, v135                                 // 
-v_mov_b32 v135, v11                                // glvw 1 mb 0 tt1 1 r 1
-v_mov_b32 v3, v135                                 // 
-v_mov_b32 v135, v12                                // glvw 1 mb 0 tt1 2 r 0
-v_mov_b32 v4, v135                                 // 
-v_mov_b32 v135, v13                                // glvw 1 mb 0 tt1 2 r 1
-v_mov_b32 v5, v135                                 // 
-v_mov_b32 v135, v14                                // glvw 1 mb 0 tt1 3 r 0
-v_mov_b32 v6, v135                                 // 
-v_mov_b32 v135, v15                                // glvw 1 mb 0 tt1 3 r 1
-v_mov_b32 v7, v135                                 // 
-v_mov_b32 v135, v24                                // glvw 1 mb 0 tt1 4 r 0
-v_mov_b32 v16, v135                                // 
-v_mov_b32 v135, v25                                // glvw 1 mb 0 tt1 4 r 1
-v_mov_b32 v17, v135                                // 
-v_mov_b32 v135, v26                                // glvw 1 mb 0 tt1 5 r 0
-v_mov_b32 v18, v135                                // 
-v_mov_b32 v135, v27                                // glvw 1 mb 0 tt1 5 r 1
-v_mov_b32 v19, v135                                // 
-v_mov_b32 v135, v28                                // glvw 1 mb 0 tt1 6 r 0
-v_mov_b32 v20, v135                                // 
-v_mov_b32 v135, v29                                // glvw 1 mb 0 tt1 6 r 1
-v_mov_b32 v21, v135                                // 
-v_mov_b32 v135, v30                                // glvw 1 mb 0 tt1 7 r 0
-v_mov_b32 v22, v135                                // 
-v_mov_b32 v135, v31                                // glvw 1 mb 0 tt1 7 r 1
-v_mov_b32 v23, v135                                // 
-v_mov_b32 v135, v40                                // glvw 1 mb 0 tt1 8 r 0
-v_mov_b32 v32, v135                                // 
-v_mov_b32 v135, v41                                // glvw 1 mb 0 tt1 8 r 1
-v_mov_b32 v33, v135                                // 
-v_mov_b32 v135, v42                                // glvw 1 mb 0 tt1 9 r 0
-v_mov_b32 v34, v135                                // 
-v_mov_b32 v135, v43                                // glvw 1 mb 0 tt1 9 r 1
-v_mov_b32 v35, v135                                // 
-v_mov_b32 v135, v44                                // glvw 1 mb 0 tt1 10 r 0
-v_mov_b32 v36, v135                                // 
-v_mov_b32 v135, v45                                // glvw 1 mb 0 tt1 10 r 1
-v_mov_b32 v37, v135                                // 
-v_mov_b32 v135, v46                                // glvw 1 mb 0 tt1 11 r 0
-v_mov_b32 v38, v135                                // 
-v_mov_b32 v135, v47                                // glvw 1 mb 0 tt1 11 r 1
-v_mov_b32 v39, v135                                // 
-v_mov_b32 v135, v56                                // glvw 1 mb 0 tt1 12 r 0
-v_mov_b32 v48, v135                                // 
-v_mov_b32 v135, v57                                // glvw 1 mb 0 tt1 12 r 1
-v_mov_b32 v49, v135                                // 
-v_mov_b32 v135, v58                                // glvw 1 mb 0 tt1 13 r 0
-v_mov_b32 v50, v135                                // 
-v_mov_b32 v135, v59                                // glvw 1 mb 0 tt1 13 r 1
-v_mov_b32 v51, v135                                // 
-v_mov_b32 v135, v60                                // glvw 1 mb 0 tt1 14 r 0
-v_mov_b32 v52, v135                                // 
-v_mov_b32 v135, v61                                // glvw 1 mb 0 tt1 14 r 1
-v_mov_b32 v53, v135                                // 
-v_mov_b32 v135, v62                                // glvw 1 mb 0 tt1 15 r 0
-v_mov_b32 v54, v135                                // 
-v_mov_b32 v135, v63                                // glvw 1 mb 0 tt1 15 r 1
-v_mov_b32 v55, v135                                // 
-v_mov_b32 v135, v72                                // glvw 1 mb 0 tt1 16 r 0
-v_mov_b32 v64, v135                                // 
-v_mov_b32 v135, v73                                // glvw 1 mb 0 tt1 16 r 1
-v_mov_b32 v65, v135                                // 
-v_mov_b32 v135, v74                                // glvw 1 mb 0 tt1 17 r 0
-v_mov_b32 v66, v135                                // 
-v_mov_b32 v135, v75                                // glvw 1 mb 0 tt1 17 r 1
-v_mov_b32 v67, v135                                // 
-v_mov_b32 v135, v76                                // glvw 1 mb 0 tt1 18 r 0
-v_mov_b32 v68, v135                                // 
-v_mov_b32 v135, v77                                // glvw 1 mb 0 tt1 18 r 1
-v_mov_b32 v69, v135                                // 
-v_mov_b32 v135, v78                                // glvw 1 mb 0 tt1 19 r 0
-v_mov_b32 v70, v135                                // 
-v_mov_b32 v135, v79                                // glvw 1 mb 0 tt1 19 r 1
-v_mov_b32 v71, v135                                // 
-v_mov_b32 v135, v88                                // glvw 1 mb 0 tt1 20 r 0
-v_mov_b32 v80, v135                                // 
-v_mov_b32 v135, v89                                // glvw 1 mb 0 tt1 20 r 1
-v_mov_b32 v81, v135                                // 
-v_mov_b32 v135, v90                                // glvw 1 mb 0 tt1 21 r 0
-v_mov_b32 v82, v135                                // 
-v_mov_b32 v135, v91                                // glvw 1 mb 0 tt1 21 r 1
-v_mov_b32 v83, v135                                // 
-v_mov_b32 v135, v92                                // glvw 1 mb 0 tt1 22 r 0
-v_mov_b32 v84, v135                                // 
-v_mov_b32 v135, v93                                // glvw 1 mb 0 tt1 22 r 1
-v_mov_b32 v85, v135                                // 
-v_mov_b32 v135, v94                                // glvw 1 mb 0 tt1 23 r 0
-v_mov_b32 v86, v135                                // 
-v_mov_b32 v135, v95                                // glvw 1 mb 0 tt1 23 r 1
-v_mov_b32 v87, v135                                // 
-v_mov_b32 v135, v104                               // glvw 1 mb 0 tt1 24 r 0
-v_mov_b32 v96, v135                                // 
-v_mov_b32 v135, v105                               // glvw 1 mb 0 tt1 24 r 1
-v_mov_b32 v97, v135                                // 
-v_mov_b32 v135, v106                               // glvw 1 mb 0 tt1 25 r 0
-v_mov_b32 v98, v135                                // 
-v_mov_b32 v135, v107                               // glvw 1 mb 0 tt1 25 r 1
-v_mov_b32 v99, v135                                // 
-v_mov_b32 v135, v108                               // glvw 1 mb 0 tt1 26 r 0
-v_mov_b32 v100, v135                               // 
-v_mov_b32 v135, v109                               // glvw 1 mb 0 tt1 26 r 1
-v_mov_b32 v101, v135                               // 
-v_mov_b32 v135, v110                               // glvw 1 mb 0 tt1 27 r 0
-v_mov_b32 v102, v135                               // 
-v_mov_b32 v135, v111                               // glvw 1 mb 0 tt1 27 r 1
-v_mov_b32 v103, v135                               // 
-v_mov_b32 v135, v120                               // glvw 1 mb 0 tt1 28 r 0
-v_mov_b32 v112, v135                               // 
-v_mov_b32 v135, v121                               // glvw 1 mb 0 tt1 28 r 1
-v_mov_b32 v113, v135                               // 
-v_mov_b32 v135, v122                               // glvw 1 mb 0 tt1 29 r 0
-v_mov_b32 v114, v135                               // 
-v_mov_b32 v135, v123                               // glvw 1 mb 0 tt1 29 r 1
-v_mov_b32 v115, v135                               // 
-v_mov_b32 v135, v124                               // glvw 1 mb 0 tt1 30 r 0
-v_mov_b32 v116, v135                               // 
-v_mov_b32 v135, v125                               // glvw 1 mb 0 tt1 30 r 1
-v_mov_b32 v117, v135                               // 
-v_mov_b32 v135, v126                               // glvw 1 mb 0 tt1 31 r 0
-v_mov_b32 v118, v135                               // 
-v_mov_b32 v135, v127                               // glvw 1 mb 0 tt1 31 r 1
-v_mov_b32 v119, v135                               // 
-s_mov_b64 s[64:65], 0xFFFFFFFFFFFFFFFF             // to restore all threads active
-s_or_saveexec_b64 vcc, s[64:65]                    // all threads active
-s_branch label_0029                                // done shifting
-
-label_0029: // end shift0
-
-
-
-/* not-LocalSplitU: global write indices */
-
-/* computeStoreVgprs */
-v_lshrrev_b32 v132, 6, v[vgprSerial]               // v132 = v[vgprSerial] / 64
-v_lshrrev_b32 v133, 2, v132                        // v133 = v132 / 4
-v_mul_lo_u32 v133, 0x10, v133                      // wave coordination offset 1
-v_and_b32 v129, 63, v[vgprSerial]                  // v129 = v[vgprSerial] % 64
-v_lshrrev_b32 v129, 4, v129                        // v129 = v129 / 16
-                                                   // thread0 * continuous_output (multiplier is 1, do nothing)
-v_add_u32 v129, v133, v129                         // coordination 1 = wave_id1 + tid1
-v_mul_lo_u32 v130, v129, s[sgprStrideC1J]          //  offset 1
-v_mul_lo_u32 v131, v129, s[sgprStrideD1J]          //  offset 1
-v_and_b32 v128, 3, v132                            // v128 = v132 % 4
-v_mul_lo_u32 v128, 0x10, v128                      // wave coordination offset 0
-v_and_b32 v133, 15, v[vgprSerial]                  // v133 = v[vgprSerial] % 16
-_v_add_lshl_u32 v128, v133, v128, 1                // coordination 0 = wave_id0 + tid0
-s_mul_i32 s63, 128, s[sgprWorkGroup0]              // wgp0 * MT0
-v_add_u32 v128, s63, v128                          // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0
-s_mul_i32 s63, 128, s[sgprWorkGroup1]              // wgp1 * MT1
-v_add_u32 v129, s63, v129                          // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1
-
-
-/* not-LocalSplitU: global write */
-
-s_mov_b32 s64, s[sgprBeta+0]                       // tmp = Beta[0]
-s_or_b32 s64, s[sgprBeta+1], s64                   // tmp |= Beta[1] 
-s_cmpk_eq_u32 s64, 0x0                             // Beta == 0
-s_cbranch_scc0 GW_Beta_46                          // Branch if Beta is not zero
-
-s_and_b32 s64, 127, s[sgprSizeI]                   // s64 = s[sgprSizeI] % 128
-s_add_u32 s65, -0x1, s[sgprNumWorkGroups0]         // 
-s_cmp_ge_u32 s[sgprWorkGroup0], s65                // wg0 >= nwg0-1 ?
-s_cselect_b32 s64, s64, 0                          // set rMT0
-s_cmpk_gt_u32 s64, 0x0                             // rMT0 > 0
-s_cbranch_scc1 GW_B0_E1_37                         // jump if edges required
-s_and_b32 s64, 127, s[sgprSizeJ]                   // s64 = s[sgprSizeJ] % 128
-s_add_u32 s65, -0x1, s[sgprNumWorkGroups1]         // 
-s_cmp_ge_u32 s[sgprWorkGroup1], s65                // wg1 >= nwg1-1
-s_cselect_b32 s64, s64, 0                          // set rMT1
-s_cmpk_gt_u32 s64, 0x0                             // rMT1 > 0
-s_cbranch_scc1 GW_B0_E1_37                         // jump if edges required
-GW_B0_E0_34:
-
-/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-_v_add_lshl_u32 v134, v131, v128, 0x3              // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128
-
-/* rC *= alpha batchEements=[(0, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-
-/* rC *= alpha batchEements=[(1, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #2 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-
-/* rC *= alpha batchEements=[(2, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #3 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-
-/* rC *= alpha batchEements=[(3, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #4 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-
-/* rC *= alpha batchEements=[(4, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #5 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-
-/* rC *= alpha batchEements=[(5, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #6 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-
-/* rC *= alpha batchEements=[(6, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #7 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-
-/* rC *= alpha batchEements=[(7, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #8 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-
-/* rC *= alpha batchEements=[(8, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #9 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-
-/* rC *= alpha batchEements=[(9, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #10 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-
-/* rC *= alpha batchEements=[(10, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #11 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-
-/* rC *= alpha batchEements=[(11, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #12 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-
-/* rC *= alpha batchEements=[(12, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #13 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-
-/* rC *= alpha batchEements=[(13, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #14 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-
-/* rC *= alpha batchEements=[(14, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #15 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-
-/* rC *= alpha batchEements=[(15, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #16 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-
-/* rC *= alpha batchEements=[(16, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #17 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-
-/* rC *= alpha batchEements=[(17, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #18 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-
-/* rC *= alpha batchEements=[(18, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #19 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-
-/* rC *= alpha batchEements=[(19, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #20 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-
-/* rC *= alpha batchEements=[(20, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #21 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-
-/* rC *= alpha batchEements=[(21, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #22 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-
-/* rC *= alpha batchEements=[(22, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #23 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-
-/* rC *= alpha batchEements=[(23, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #24 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-
-/* rC *= alpha batchEements=[(24, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #25 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-
-/* rC *= alpha batchEements=[(25, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #26 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-
-/* rC *= alpha batchEements=[(26, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #27 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-
-/* rC *= alpha batchEements=[(27, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #28 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-
-/* rC *= alpha batchEements=[(28, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #29 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-
-/* rC *= alpha batchEements=[(29, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #30 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-
-/* rC *= alpha batchEements=[(30, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Batch #31 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-
-/* rC *= alpha batchEements=[(31, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_45                           // jump to end
-GW_B0_E1_37:
-
-/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(0, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(0,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(0, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(1, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(1,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(1, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #4 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(2, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #5 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(2,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(2, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #6 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(3, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #7 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(3,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(3, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #8 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(4, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #9 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(4,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(4, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #10 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(5, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #11 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(5,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(5, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #12 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(6, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #13 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(6,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(6, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #14 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(7, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #15 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(7,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(7, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #16 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(8, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #17 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(8,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(8, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #18 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(9, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #19 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(9,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(9, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #20 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(10, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #21 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(10,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(10, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #22 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(11, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #23 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(11,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(11, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #24 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(12, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #25 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(12,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(12, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #26 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(13, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #27 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(13,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(13, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #28 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(14, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #29 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(14,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(14, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #30 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(15, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #31 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(15,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(15, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #32 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(16, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #33 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(16,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(16, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #34 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(17, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #35 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(17,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(17, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #36 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(18, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #37 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(18,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(18, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #38 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(19, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #39 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(19,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(19, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #40 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(20, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #41 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(20,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(20, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #42 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(21, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #43 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(21,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(21, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #44 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(22, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #45 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(22,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(22, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #46 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(23, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #47 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(23,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(23, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #48 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(24, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #49 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(24,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(24, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #50 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(25, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #51 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(25,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(25, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #52 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(26, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #53 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(26,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(26, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #54 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(27, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #55 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(27,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(27, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #56 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(28, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #57 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(28,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(28, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #58 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(29, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #59 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(29,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(29, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #60 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(30, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #61 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(30,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(30, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #62 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(31, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Edge Batch #63 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-/* (d1,vc1,d0,vc0)=(31,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-
-/* rC *= alpha batchEements=[(31, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha
-
-/* apply mask, calc new C and issue writes */
-buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_45                           // jump to end
-GW_Beta_46:
-s_and_b32 s64, 127, s[sgprSizeI]                   // s64 = s[sgprSizeI] % 128
-s_add_u32 s65, -0x1, s[sgprNumWorkGroups0]         // 
-s_cmp_ge_u32 s[sgprWorkGroup0], s65                // wg0 >= nwg0-1 ?
-s_cselect_b32 s64, s64, 0                          // set rMT0
-s_cmpk_gt_u32 s64, 0x0                             // rMT0 > 0
-s_cbranch_scc1 GW_B1_E1_44                         // jump if edges required
-s_and_b32 s64, 127, s[sgprSizeJ]                   // s64 = s[sgprSizeJ] % 128
-s_add_u32 s65, -0x1, s[sgprNumWorkGroups1]         // 
-s_cmp_ge_u32 s[sgprWorkGroup1], s65                // wg1 >= nwg1-1
-s_cselect_b32 s64, s64, 0                          // set rMT1
-s_cmpk_gt_u32 s64, 0x0                             // rMT1 > 0
-s_cbranch_scc1 GW_B1_E1_44                         // jump if edges required
-GW_B1_E0_41:
-
-/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(0, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-_v_add_lshl_u32 v135, v130, v128, 0x3              // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128
-_v_add_lshl_u32 v134, v131, v128, 0x3              // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=128, coord0Vgpr=128
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #1 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(1, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #2 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(2, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #3 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(3, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #4 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(4, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #5 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(5, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #6 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(6, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #7 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(7, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #8 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(8, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #9 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(9, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #10 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(10, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #11 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(11, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #12 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(12, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #13 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(13, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #14 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(14, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #15 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(15, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #16 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(16, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #17 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(17, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #18 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(18, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #19 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(19, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #20 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(20, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #21 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(21, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #22 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(22, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #23 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(23, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #24 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(24, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #25 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(25, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #26 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(26, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #27 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(27, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #28 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(28, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #29 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(29, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #30 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(30, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Batch #31 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(31, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-s_mul_i32 s64, s[sgprStrideC1J], 32                // scale StrideC *= numRows(4) * bpe
-s_add_u32  s[sgprSrdC+0], s[sgprSrdC+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdC+1], s[sgprSrdC+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-s_mul_i32 s64, s[sgprStrideD1J], 32                // scale StrideD *= numRows(4) * bpe
-s_add_u32  s[sgprSrdD+0], s[sgprSrdD+0], s64       // incToNextRow: gra SRD += inc(lower)
-s_addc_u32  s[sgprSrdD+1], s[sgprSrdD+1], 0        // incToNextRow: gra SRD += inc(upper)
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_45                           // jump to end
-GW_B1_E1_44:
-
-// TODO in Generator
-// wider store if M % 2 == 0
-s_and_b32 s63, 0x1, s[sgprSizeI]
-s_cbranch_scc0 GW_B1_E1_VW2                                // done shifting
-
-/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(0, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(0, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(0,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(1, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(1, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(1,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #4 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(2, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #5 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(2, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(2,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #6 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(3, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #7 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(3, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(3,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #8 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(4, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #9 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(4, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(4,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #10 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(5, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #11 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(5, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(5,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #12 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(6, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #13 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(6, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(6,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #14 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(7, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #15 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(7, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(7,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #16 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(8, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #17 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(8, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(8,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #18 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(9, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #19 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,1:vw1)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(9, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(9,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #20 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(10, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #21 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(10, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(10,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #22 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(11, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #23 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(11, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(11,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #24 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(12, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #25 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(12, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(12,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #26 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(13, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #27 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(13, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(13,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #28 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(14, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #29 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(14, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(14,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #30 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(15, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #31 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(15, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(15,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #32 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(16, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #33 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(16, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(16,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #34 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(17, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #35 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(17, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(17,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #36 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(18, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #37 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(18, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(18,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #38 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(19, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #39 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(19, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(19,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #40 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(20, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #41 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(20, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(20,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #42 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(21, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #43 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(21, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(21,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #44 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(22, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #45 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(22, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(22,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #46 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(23, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #47 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(23, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(23,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #48 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(24, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #49 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(24, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(24,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #50 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(25, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #51 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(25, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(25,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #52 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(26, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #53 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(26, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(26,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #54 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(27, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #55 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(27, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(27,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #56 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(28, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #57 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(28, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(28,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #58 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(29, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #59 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(29, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(29,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #60 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(30, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #61 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(30, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(30,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #62 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(31, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #63 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,1:vw1)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(31, 0, 0, 1)] */
-v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(31,0,0,1) */
-_v_add_co_u32 v132, vcc, v128, 1                   // coord0.1: coord0 += d0*sg0*VW + vc0
-v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v132, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+138:vgprValuC+138+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+138:vgprValuC+138+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx2 v[138:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_45                           // jump to end
-
-GW_B1_E1_VW2:
-
-/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #0 (d1,d0,vc1,vc0) = */
-/*    (0,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(0, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(0,0,0,0) */
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #1 (d1,d0,vc1,vc0) = */
-/*    (1,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(1, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(1,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #2 (d1,d0,vc1,vc0) = */
-/*    (2,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(2, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(2,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #3 (d1,d0,vc1,vc0) = */
-/*    (3,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(3, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(3,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #4 (d1,d0,vc1,vc0) = */
-/*    (4,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(4, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(4,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #5 (d1,d0,vc1,vc0) = */
-/*    (5,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(5, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(5,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #6 (d1,d0,vc1,vc0) = */
-/*    (6,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(6, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(6,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #7 (d1,d0,vc1,vc0) = */
-/*    (7,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(7, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(7,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #8 (d1,d0,vc1,vc0) = */
-/*    (8,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(8, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(8,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #9 (d1,d0,vc1,vc0) = */
-/*    (9,0,0,0:vw2)                       */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(9, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(9,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #10 (d1,d0,vc1,vc0) = */
-/*    (10,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(10, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(10,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #11 (d1,d0,vc1,vc0) = */
-/*    (11,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(11, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(11,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #12 (d1,d0,vc1,vc0) = */
-/*    (12,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(12, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(12,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #13 (d1,d0,vc1,vc0) = */
-/*    (13,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(13, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(13,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #14 (d1,d0,vc1,vc0) = */
-/*    (14,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(14, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(14,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #15 (d1,d0,vc1,vc0) = */
-/*    (15,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(15, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(15,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #16 (d1,d0,vc1,vc0) = */
-/*    (16,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(16, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(16,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #17 (d1,d0,vc1,vc0) = */
-/*    (17,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(17, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(17,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #18 (d1,d0,vc1,vc0) = */
-/*    (18,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(18, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(18,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #19 (d1,d0,vc1,vc0) = */
-/*    (19,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(19, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(19,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #20 (d1,d0,vc1,vc0) = */
-/*    (20,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(20, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(20,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #21 (d1,d0,vc1,vc0) = */
-/*    (21,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(21, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(21,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #22 (d1,d0,vc1,vc0) = */
-/*    (22,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(22, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(22,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #23 (d1,d0,vc1,vc0) = */
-/*    (23,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(23, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(23,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #24 (d1,d0,vc1,vc0) = */
-/*    (24,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(24, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(24,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #25 (d1,d0,vc1,vc0) = */
-/*    (25,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(25, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(25,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #26 (d1,d0,vc1,vc0) = */
-/*    (26,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(26, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(26,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #27 (d1,d0,vc1,vc0) = */
-/*    (27,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(27, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(27,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #28 (d1,d0,vc1,vc0) = */
-/*    (28,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(28, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(28,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #29 (d1,d0,vc1,vc0) = */
-/*    (29,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(29, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(29,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #30 (d1,d0,vc1,vc0) = */
-/*    (30,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(30, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(30,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 */
-s_sleep 5 // optimization: sync and wait
-s_barrier
-
-/******************************************/
-/* Global Write Beta Edge Batch #31 (d1,d0,vc1,vc0) = */
-/*    (31,0,0,0:vw2)                      */
-/******************************************/
-
-/* calc coords, apply mask, and issue loads (if necessary) */
-
-/* rC *= alpha batchEements=[(31, 0, 0, 0)] */
-v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha
-v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha
-/* (d1,vc1,d0,vc0)=(31,0,0,0) */
-_v_add_co_u32 v129, vcc, v129, 4                   // coord1.1: coord1Vgpr += d1*sg1*VW + vc1
-
-/* Fix for UseInitialStridesCD, emitAddressSetupCode */
-s_mul_i32 s64, s[sgprStrideC1J], 4                 // scale stride
-_v_add_u32 v130, v130, s64                         // ROWINC- Move cinRowPtr to next row
-s_mul_i32 s64, s[sgprStrideD1J], 4                 // scale stride
-_v_add_u32 v131, v131, s64                         // Move coutRowPtr to next row
-v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI]          // coord0 < size0
-v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ]          // coord1 < size1
-s_and_b64 s[38:39], s[64:65], s[38:39]             // in0 && in1
-_v_add_lshl_u32 v135, v130, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v135, -1, v135, s[38:39]             // LDC clip if OOB. offset
-_v_add_lshl_u32 v134, v131, v128, 0x3              // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
-v_cndmask_b32 v134, -1, v134, s[38:39]             // LDD clip if OOB. offset
-buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0,  glc slc // load C for beta calc
-s_sleep 5 // optimization: sync and wait
-s_barrier
-s_waitcnt vmcnt(0)                                 // wait C
-
-/* apply mask, calc new C and issue writes */
-v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta
-v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta
-buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0,  glc slc // store D
-s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
-s_branch label_GW_End_45                           // jump to end
-label_GW_End_45:
-
-label_0047:  /// KernelEnd
-s_endpgm                                           // Kernel End
-
-
diff --git a/src/Tensile/KernelWriter.py b/src/Tensile/KernelWriter.py
index e51b22e631..5acb743afe 100644
--- a/src/Tensile/KernelWriter.py
+++ b/src/Tensile/KernelWriter.py
@@ -5291,7 +5291,7 @@ def getReplacementKernelPath(self, kernel):
     kernelName = self.getKernelName(kernel)
 
     if isCustomKernelConfig(kernel):
-      return os.path.join(globalParameters["CustomKernelDirectory"], (kernelName + ".s"))
+      return globalParameters["CustomKernelDirectory"].joinpath(kernelName + ".s")
     else: # Replacement kernel
       return ReplacementKernels.Get(kernelName)
 
diff --git a/src/Tensile/TensileCreateLibrary.py b/src/Tensile/TensileCreateLibrary.py
index 455435c85c..fbe60be050 100644
--- a/src/Tensile/TensileCreateLibrary.py
+++ b/src/Tensile/TensileCreateLibrary.py
@@ -34,7 +34,8 @@
 from . import LibraryIO
 from . import Utils
 from .Common import globalParameters, HR, print1, print2, printExit, ensurePath, \
-                    CHeader, CMakeHeader, assignGlobalParameters, gfxName, architectureMap
+                    CHeader, CMakeHeader, assignGlobalParameters, gfxName, architectureMap, \
+                    copy_data_files  
 from .KernelWriterAssembly import KernelWriterAssembly
 from .KernelWriterSource import KernelWriterSource
 from .SolutionLibrary import MasterSolutionLibrary
@@ -651,11 +652,7 @@ def copyStaticFiles(outputPath=None):
     "tensile_float8_bfloat8.h",
     "hip_f8_impl.h",
     "KernelHeader.h" ]
-
-  for fileName in libraryStaticFiles:
-    # copy file
-    shutil.copy( os.path.join(globalParameters["SourcePath"], fileName), \
-        outputPath )
+  copy_data_files(libraryStaticFiles, outputPath)
 
   return libraryStaticFiles
 
diff --git a/src/Tensile/data/Source/client/CMakeLists.txt b/src/Tensile/data/Source/client/CMakeLists.txt
index cae52f54c1..cd4075de74 100644
--- a/src/Tensile/data/Source/client/CMakeLists.txt
+++ b/src/Tensile/data/Source/client/CMakeLists.txt
@@ -60,7 +60,7 @@ find_package(Boost COMPONENTS program_options REQUIRED)
 if (NOT WIN32)
     find_package(ROCmSMI QUIET)
     if(NOT ROCmSMI_FOUND)
-        set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${Tensile_DIR}" "${Tensile_DIR}/../Source/cmake" "${CMAKE_SOURCE_DIR}/cmake")
+        set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${CMAKE_SOURCE_DIR}/cmake")
         find_package(ROCmSMI REQUIRED)
     endif()
 endif()
diff --git a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2 b/src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2
similarity index 96%
rename from src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
rename to src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2
index 1c56f4a612..5b6d82ecbb 100644
--- a/src/Tensile/data/cmake/TensileConfigVersion.cmake.j2
+++ b/src/Tensile/data/template/cmake/TensileConfigVersion.cmake.j2
@@ -27,7 +27,7 @@ set(TENSILE_VERSION_MINOR {TENSILE_VERSION_MINOR})
 set(TENSILE_VERSION_PATCH  {TENSILE_VERSION_PATCH})
 
 # export version
-set(PACKAGE_VERSION "${TENSILE_VERSION_MAJOR}.${TENSILE_VERSION_MINOR}.${TENSILE_VERSION_PATCH}")
+set(PACKAGE_VERSION "{TENSILE_VERSION_MAJOR}.{TENSILE_VERSION_MINOR}.{TENSILE_VERSION_PATCH}")
 
 if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
     set(PACKAGE_VERSION_EXACT TRUE)